diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index 059fd2695..c01006efe 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
index 6ecf3bcc7..0314d469b 100644
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 432fb5dad..6d5943a2f 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -18,7 +18,7 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
ENV LC_ALL=C.utf8
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index b937a4829..23f428944 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
-RUN make
+RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile
index 274b91b71..7516c8313 100644
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
index 0a706dc73..37576d68e 100644
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT [ "/app/main" ]
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 3ab1decd6..763d75fce 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,7 +9,7 @@ WORKDIR /app
COPY . .
-RUN make
+RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
index 59a52ba21..7f5228185 100644
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile
index a8e451fa9..13d00b737 100644
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
@@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile
index c02a31dd8..a6b76dee8 100644
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT [ "/app/server" ]
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
index be964e0e8..0d09d3627 100644
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,7 +11,7 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 3a7d274e4..97424c3aa 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,7 +8,7 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
- python3 ./convert.py "$@"
+ python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml
new file mode 100644
index 000000000..bfb9d9a06
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -0,0 +1,50 @@
+name: Low Severity Bugs
+description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non-critical UI glitches)
+title: "Bug: "
+labels: ["bug-unconfirmed", "low severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
new file mode 100644
index 000000000..e8297eea0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -0,0 +1,50 @@
+name: Medium Severity Bug
+description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features, but generally still usable)
+title: "Bug: "
+labels: ["bug-unconfirmed", "medium severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml
new file mode 100644
index 000000000..3c9d50d16
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -0,0 +1,50 @@
+name: High Severity Bug
+description: Used to report high severity bugs in llama.cpp (e.g. malfunctioning features hindering important common workflows)
+title: "Bug: "
+labels: ["bug-unconfirmed", "high severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
new file mode 100644
index 000000000..d089d5fa1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -0,0 +1,50 @@
+name: Critical Severity Bug
+description: Used to report critical severity bugs in llama.cpp (e.g. crashes, corruption, data loss)
+title: "Bug: "
+labels: ["bug-unconfirmed", "critical severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml
new file mode 100644
index 000000000..58fca7318
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml
@@ -0,0 +1,51 @@
+name: Enhancement
+description: Used to request enhancements for llama.cpp
+title: "Feature Request: "
+labels: ["enhancement"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ [Please post your idea first in Discussions if there is not yet a consensus for this enhancement request. This will help to keep the issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+
+ - type: checkboxes
+ id: prerequisites
+ attributes:
+ label: Prerequisites
+ description: Please confirm the following before submitting your enhancement request.
+ options:
+ - label: I am running the latest code. Mention the version if possible as well.
+ required: true
+ - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+ required: true
+ - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
+ required: true
+ - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+ required: true
+
+ - type: textarea
+ id: feature-description
+ attributes:
+ label: Feature Description
+ description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+ placeholder: Detailed description of the enhancement
+ validations:
+ required: true
+
+ - type: textarea
+ id: motivation
+ attributes:
+ label: Motivation
+ description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+ placeholder: Explanation of why this feature is needed and its benefits
+ validations:
+ required: true
+
+ - type: textarea
+ id: possible-implementation
+ attributes:
+ label: Possible Implementation
+ description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
+ placeholder: Detailed description of potential implementation
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/06-research.yml
new file mode 100644
index 000000000..3ae4e9f8c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@@ -0,0 +1,52 @@
+name: Research
+description: Track a new technical research area
+title: "Research: "
+labels: ["research 🔬"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+
+ - type: checkboxes
+ id: research-stage
+ attributes:
+ label: Research Stage
+ description: Track the general state of this research ticket
+ options:
+ - label: Background Research (Let's try to avoid reinventing the wheel)
+ - label: Hypothesis Formed (How do you think this will work, and what will its effect be?)
+ - label: Strategy / Implementation Forming
+ - label: Analysis of results
+ - label: Debrief / Documentation (So people in the future can learn from us)
+
+ - type: textarea
+ id: background
+ attributes:
+ label: Existing literature and research
+ description: What's the current state of the art, and what's the motivation for this research?
+
+ - type: textarea
+ id: hypothesis
+ attributes:
+ label: Hypothesis
+ description: How do you think this will work, and what will its effect be?
+
+ - type: textarea
+ id: implementation
+ attributes:
+ label: Implementation
+ description: Got an approach? E.g. is a PR ready to go?
+
+ - type: textarea
+ id: analysis
+ attributes:
+ label: Analysis
+ description: How does the proposed implementation behave?
+
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/07-refactor.yml
new file mode 100644
index 000000000..3a68d3d53
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/07-refactor.yml
@@ -0,0 +1,28 @@
+name: Refactor (Maintainers)
+description: Used to track refactoring opportunities
+title: "Refactor: "
+labels: ["refactor"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+ You may also want to check the [refactor pull request label](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates.
+
+ - type: textarea
+ id: background-description
+ attributes:
+ label: Background Description
+ description: Please provide a detailed written description of the pain points you are trying to solve.
+ placeholder: Detailed description of your motivation for requesting this refactor
+ validations:
+ required: true
+
+ - type: textarea
+ id: possible-approaches
+ attributes:
+ label: Possible Refactor Approaches
+ description: If you have ideas about possible approaches to solve this problem, describe them here. You may want to make it a todo list.
+ placeholder: Your ideas for possible refactoring approaches
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md
deleted file mode 100644
index 49812832c..000000000
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
----
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..c88134dbb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,13 @@
+blank_issues_enabled: true
+contact_links:
+ - name: Got an idea?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+ about: Pop it there. It may then become an enhancement ticket.
+ - name: Got a question?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+ about: Ask a question there!
+ - name: Want to contribute?
+ url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+ about: Head to the contribution guide page of the wiki for areas you can help with
+
+
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
deleted file mode 100644
index dcffda750..000000000
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
----
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
-- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
-- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
-- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
diff --git a/.github/labeler.yml b/.github/labeler.yml
index fca60594f..97d739b58 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,5 +1,16 @@
# https://github.com/actions/labeler
-
+Kompute:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml-kompute.h
+ - ggml-kompute.cpp
+ - README-kompute.md
+Apple Metal:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml-metal.h
+ - ggml-metal.cpp
+ - README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
@@ -9,6 +20,7 @@ SYCL:
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
+ - ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:
@@ -62,6 +74,8 @@ server:
ggml:
- changed-files:
- any-glob-to-any-file:
+ - ggml.c
+ - ggml.h
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
@@ -71,3 +85,6 @@ nix:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix
+embedding:
+ - changed-files:
+ - any-glob-to-any-file: examples/embedding/
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 7b616281b..93669d531 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -294,12 +294,22 @@ jobs:
- name: Build
id: cmake_build
+ if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+ - name: Build (no OpenMP)
+ id: cmake_build_no_openmp
+ if: ${{ matrix.sanitizer == 'THREAD' }}
+ run: |
+ mkdir build
+ cd build
+ cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
+ cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
+
- name: Test
id: cmake_test
run: |
@@ -678,8 +688,6 @@ jobs:
env:
OPENBLAS_VERSION: 0.3.23
- OPENCL_VERSION: 2023.04.17
- CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@@ -696,8 +704,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- - build: 'clblast-x64'
- defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@@ -722,27 +728,6 @@ jobs:
run: |
git submodule update --init kompute
- - name: Download OpenCL SDK
- id: get_opencl
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
- mkdir $env:RUNNER_TEMP/opencl
- tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
-
- - name: Download CLBlast
- id: get_clblast
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
- curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
- 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
- rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
- foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
- $txt = Get-Content -Path $f -Raw
- $txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
- }
-
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@@ -776,13 +761,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- - name: Add clblast.dll
- id: add_clblast_dll
- if: ${{ matrix.build == 'clblast-x64' }}
- run: |
- cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
- cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
-
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@@ -806,7 +784,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
- if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
+ if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@@ -1061,7 +1039,7 @@ jobs:
# hypervisor: 'qemu'
# run: |
# sudo pkg update
-# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
+# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:
diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml
deleted file mode 100644
index 747c35cc0..000000000
--- a/.github/workflows/zig-build.yml
+++ /dev/null
@@ -1,29 +0,0 @@
-name: Zig CI
-
-on:
- pull_request:
- push:
- branches:
- - master
-
-concurrency:
- group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
- cancel-in-progress: true
-
-jobs:
- build:
- strategy:
- fail-fast: false
- matrix:
- runs-on: [ubuntu-latest, macos-latest, windows-latest]
- runs-on: ${{ matrix.runs-on }}
- steps:
- - uses: actions/checkout@v4
- with:
- submodules: recursive
- fetch-depth: 0
- - uses: goto-bus-stop/setup-zig@v2
- with:
- version: 0.11.0
- - name: Build Summary
- run: zig build --summary all -freference-trace
diff --git a/.gitignore b/.gitignore
index 50ae0973a..5223c6963 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,9 +34,11 @@ ggml-metal-embed.metal
lcov-report/
gcovr-report/
+tags
build*
!build.zig
cmake-build-*
+android-ndk-*
out/
tmp/
@@ -105,6 +107,7 @@ examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
+examples/server/*.css.hpp
poetry.lock
poetry.toml
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cc60039a..cf37d5bb2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ else()
set(INS_ENB ON)
endif()
+option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
@@ -105,11 +106,11 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
-option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
@@ -124,7 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
-option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
+option(LLAMA_OPENMP "llama: use OpenMP" ON)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@@ -295,6 +296,17 @@ if (LLAMA_METAL)
)
endif()
+if (LLAMA_OPENMP)
+ find_package(OpenMP)
+ if (OpenMP_FOUND)
+ message(STATUS "OpenMP found")
+ add_compile_definitions(GGML_USE_OPENMP)
+ set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
+ else()
+ message(WARNING "OpenMP not found")
+ endif()
+endif()
+
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@@ -384,10 +396,6 @@ if (LLAMA_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif()
-if (LLAMA_QKK_64)
- add_compile_definitions(GGML_QKK_64)
-endif()
-
if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON)
@@ -406,6 +414,8 @@ if (LLAMA_CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@@ -431,6 +441,18 @@ if (LLAMA_CUDA)
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
+ if (LLAMA_CUDA_FA_ALL_QUANTS)
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+ else()
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+ list(APPEND GGML_SOURCES_CUDA ${SRCS})
+ endif()
if (LLAMA_STATIC)
if (WIN32)
@@ -479,22 +501,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
-if (LLAMA_CLBLAST)
- find_package(CLBlast)
- if (CLBlast_FOUND)
- message(STATUS "CLBlast found")
-
- set(GGML_HEADERS_OPENCL ggml-opencl.h)
- set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
-
- add_compile_definitions(GGML_USE_CLBLAST)
-
- set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
- else()
- message(WARNING "CLBlast not found")
- endif()
-endif()
-
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@@ -505,6 +511,12 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN)
+ # Workaround for the "can't dereference invalidated vector iterator" bug in clang-cl debug builds
+ # Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+ if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+ add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
+ endif()
+
if (LLAMA_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
endif()
@@ -528,12 +540,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
- if ($ENV{ROCM_PATH})
- set(ROCM_PATH $ENV{ROCM_PATH})
+ if (NOT EXISTS $ENV{ROCM_PATH})
+ if (NOT EXISTS /opt/rocm)
+ set(ROCM_PATH /usr)
+ else()
+ set(ROCM_PATH /opt/rocm)
+ endif()
else()
- set(ROCM_PATH /opt/rocm)
+ set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
+ list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@@ -569,6 +586,8 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@@ -588,6 +607,19 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
+ if (LLAMA_CUDA_FA_ALL_QUANTS)
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
+ else()
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
+ list(APPEND GGML_SOURCES_ROCM ${SRCS})
+ endif()
+
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@@ -626,6 +658,10 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16)
endif()
+ if (LLAMA_CUDA_FORCE_MMQ)
+ add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+ endif()
+
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -741,6 +777,7 @@ if (LLAMA_KOMPUTE)
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q6_k.comp
+ kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
@@ -773,6 +810,7 @@ if (LLAMA_KOMPUTE)
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_mul_mat_q6_k.h
+ shaderop_getrows_f32.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
@@ -1039,6 +1077,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
+ if (LLAMA_SVE)
+ list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+ endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1207,7 +1248,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
- ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@@ -1295,8 +1335,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
- "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
- "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
+ "${GGML_HEADERS_CUDA}"
+ "${GGML_HEADERS_METAL}"
+ "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
@@ -1305,7 +1346,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
- FILES convert.py
+ FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
@@ -1332,6 +1373,13 @@ if (LLAMA_METAL)
endif()
endif()
+configure_file(cmake/llama.pc.in
+ "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ @ONLY)
+
+install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
+ DESTINATION lib/pkgconfig)
+
#
# programs, examples and tests
#
diff --git a/CMakePresets.json b/CMakePresets.json
index ad1af7ecc..e2b7a79e3 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,4 +1,4 @@
-{
+{
"version": 4,
"configurePresets": [
{
@@ -40,6 +40,10 @@
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
- { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+ { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+
+ { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+ { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+ { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}
diff --git a/Makefile b/Makefile
index 6b7c853b3..802ee6a47 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
- simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
+ simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
LLAMA_METAL := 1
endif
+ LLAMA_NO_OPENMP := 1
+
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
@@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
+ifdef LLAMA_RPC
+ BUILD_TARGETS += rpc-server
+endif
+
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
ifdef LLAMA_FAST
MK_CFLAGS += -Ofast
HOST_CXXFLAGS += -Ofast
+ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
+endif # LLAMA_DEBUG
else
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
+ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
-endif
+endif # LLAMA_DEBUG
+endif # LLAMA_FAST
ifndef LLAMA_NO_CCACHE
CCACHE := $(shell which ccache)
@@ -201,9 +211,10 @@ ifdef LLAMA_SCHED_MAX_COPIES
endif
ifdef LLAMA_DEBUG
- MK_CFLAGS += -O0 -g
- MK_CXXFLAGS += -O0 -g
- MK_LDFLAGS += -g
+ MK_CFLAGS += -O0 -g
+ MK_CXXFLAGS += -O0 -g
+ MK_LDFLAGS += -g
+ MK_NVCCFLAGS += -O0 -g
ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@@ -389,10 +400,6 @@ else
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif
-ifdef LLAMA_QKK_64
- MK_CPPFLAGS += -DGGML_QKK_64
-endif
-
ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
@@ -404,6 +411,12 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE
+ifndef LLAMA_NO_OPENMP
+ MK_CPPFLAGS += -DGGML_USE_OPENMP
+ MK_CFLAGS += -fopenmp
+ MK_CXXFLAGS += -fopenmp
+endif # LLAMA_NO_OPENMP
+
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@@ -420,11 +433,25 @@ ifdef LLAMA_BLIS
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
+ifdef LLAMA_RPC
+ MK_CPPFLAGS += -DGGML_USE_RPC
+ OBJS += ggml-rpc.o
+endif # LLAMA_RPC
+
ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif
+OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
+else
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
+ OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
+endif # LLAMA_CUDA_FA_ALL_QUANTS
+
ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
@@ -435,6 +462,7 @@ ifdef LLAMA_CUDA
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ OBJS += $(OBJS_CUDA_TEMP_INST)
MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
@@ -445,6 +473,9 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
+ifdef LLAMA_CUDA_DEBUG
+ MK_NVCCFLAGS += --device-debug
+endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else
@@ -494,7 +525,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
-endif
+endif # LLAMA_CUDA_CCBIN
+ifdef LLAMA_CUDA_FA_ALL_QUANTS
+ MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
+endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
@@ -506,30 +540,13 @@ define NVCC_COMPILE
endef # NVCC_COMPILE
endif # JETSON_EOL_MODULE_DETECT
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC_COMPILE)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
endif # LLAMA_CUDA
-ifdef LLAMA_CLBLAST
- MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
- MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
- MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
-
- # Mac provides OpenCL as a framework
- ifeq ($(UNAME_S),Darwin)
- MK_LDFLAGS += -lclblast -framework OpenCL
- else
- MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
- endif
- OBJS += ggml-opencl.o
-
-ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
- $(CXX) $(CXXFLAGS) -c $< -o $@
-endif # LLAMA_CLBLAST
-
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@@ -572,6 +589,7 @@ ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@@ -585,11 +603,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
+ OBJS += $(OBJS_CUDA_TEMP_INST)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
+ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
@@ -627,11 +646,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
+COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
+COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
+
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
+ifdef LLAMA_RPC
+ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
+ $(CXX) $(CXXFLAGS) -c $< -o $@
+
+rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+ $(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+endif # LLAMA_RPC
+
GF_CC := $(CC)
include scripts/get-flags.mk
@@ -711,14 +745,9 @@ unicode.o: unicode.cpp unicode.h
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
-
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
-COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
-COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
-
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@@ -749,6 +778,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
+ rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete
#
@@ -817,7 +847,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@@ -867,10 +897,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
- $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/README-sycl.md b/README-sycl.md
index cfa248a95..62b38135c 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
-It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
+It has a similar design to other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc.*. In the initial work, oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## News
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
-| OS | Status | Verified |
-|---------|---------|------------------------------------|
-| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
-| Windows | Support | Windows 11 |
+| OS | Status | Verified |
+|---------|---------|------------------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
+| Windows | Support | Windows 11 |
## Hardware
@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
-| Intel Arc Series | Support | Arc 770, 730M |
+| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
diff --git a/README.md b/README.md
index f4088c05e..9d2a59d89 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,9 @@

-[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://opensource.org/licenses/MIT)
+[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
-- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`; please use `convert-hf-to-gguf.py` instead** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -74,7 +77,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
-- Vulkan, SYCL, and (partial) OpenCL backend support
+- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@@ -127,6 +130,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
+- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
@@ -140,11 +144,14 @@ Typically finetunes of the base models below are supported as well.
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
+- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**HTTP server**
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
+[simplechat](./examples/server/public_simplechat) is a simple chat client that can be used from a local web browser to chat with a model exposed by the web server above (use `--path` to point to simplechat).
+
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@@ -198,9 +205,14 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
+**Tools:**
+
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+
---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -309,8 +321,6 @@ In order to build llama.cpp you have four different options.
make
```
- **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
-
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -322,40 +332,38 @@ In order to build llama.cpp you have four different options.
make
```
+ - Notes:
+ - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs.
+ - For faster repeated compilation, install [ccache](https://ccache.dev/).
+ - For debug builds, run `make LLAMA_DEBUG=1`
+
- Using `CMake`:
- ```bash
- cmake -B build
- cmake --build build --config Release
- ```
+ ```bash
+ cmake -B build
+ cmake --build build --config Release
+ ```
- **Note**: for `Debug` builds, there are two cases:
+ **Notes**:
- - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+ - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs.
+ - For faster repeated compilation, install [ccache](https://ccache.dev/).
+ - For debug builds, there are two cases:
+
+ 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
- - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+ 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
-- Using `Zig` (version 0.11 or later):
-
- Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
- it's also possible to cross compile for other operating systems and architectures:
-
- ```bash
- zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
- ```
-
- The `zig targets` command will give you valid options to use.
-
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@@ -363,15 +371,18 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
- sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
- opencl clblast openblas
+ sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
- **Notes:** With this packages you can build llama.cpp with OPENBLAS and
- CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
- the instructions for use and activate this options in this document below.
+### Homebrew
+
+On Mac and Linux, the Homebrew package manager can be used via
+```bash
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Metal Build
@@ -383,7 +394,7 @@ argument.
### BLAS Build
-Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
+Building the program with BLAS support may lead to some performance improvements in prompt processing when using batch sizes higher than 32 (the default is 512). CPU-only BLAS implementations do not affect normal generation performance, while GPU-backed implementations such as cuBLAS and hipBLAS may improve it. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@@ -471,10 +482,12 @@ Building the program with BLAS support may lead to some performance improvements
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging math libraries. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
+ | LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all combinations of KV cache quantization types for the FlashAttention CUDA kernels. Allows more fine-grained control over KV cache size, but compilation takes much longer. |
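+
+  As a rough sketch, a build that enables the half-precision CUDA kernels and all FlashAttention KV cache quantization combinations could look like this (the exact option set is only an example; adjust it to your hardware):
+
+  ```bash
+  # with make
+  make LLAMA_CUDA=1 LLAMA_CUDA_F16=1 LLAMA_CUDA_FA_ALL_QUANTS=1 -j$(nproc)
+
+  # or with CMake
+  cmake -B build -DLLAMA_CUDA=ON -DLLAMA_CUDA_F16=ON -DLLAMA_CUDA_FA_ALL_QUANTS=ON
+  cmake --build build --config Release
+  ```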
- #### hipBLAS
@@ -535,111 +548,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
-- #### CLBlast
-
- OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
-
- You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
-
- - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
-
- -
- Installing the OpenCL SDK from source
-
- ```sh
- git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
- cd OpenCL-SDK
- cmake -B build -DBUILD_DOCS=OFF \
- -DBUILD_EXAMPLES=OFF \
- -DBUILD_TESTING=OFF \
- -DOPENCL_SDK_BUILD_SAMPLES=OFF \
- -DOPENCL_SDK_TEST_SAMPLES=OFF
- cmake --build build
- cmake --install build --prefix /some/path
- ```
-
-
- ##### Installing CLBlast
-
- Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
-
- Linux packaging:
- Fedora Linux:
- ```bash
- sudo dnf install clblast
- ```
-
- Alternatively, they may be built from source.
-
- -
- Windows:
-
- ```cmd
- set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
- git clone https://github.com/CNugteren/CLBlast.git
- cd CLBlast
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
- cmake --build build --config Release
- cmake --install build --prefix C:/CLBlast
- ```
-
- (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
-
- -
- Unix:
-
- ```sh
- git clone https://github.com/CNugteren/CLBlast.git
- cd CLBlast
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
- cmake --build build --config Release
- cmake --install build --prefix /some/path
- ```
-
- Where `/some/path` is where the built library will be installed (default is `/usr/local`).
-
-
- ##### Building Llama with CLBlast
-
- - Build with make:
- ```sh
- make LLAMA_CLBLAST=1
- ```
- - CMake (Unix):
- ```sh
- cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
- cmake --build build --config Release
- ```
- - CMake (Windows):
- ```cmd
- set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
- git clone https://github.com/ggerganov/llama.cpp
- cd llama.cpp
- cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
- cmake --build build --config Release
- cmake --install build --prefix C:/LlamaCPP
- ```
-
- ##### Running Llama with CLBlast
-
- The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
-
- To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
- The selection can be a number (starting from 0) or a text string to search:
-
- ```sh
- GGML_OPENCL_PLATFORM=1 ./main ...
- GGML_OPENCL_DEVICE=2 ./main ...
- GGML_OPENCL_PLATFORM=Intel ./main ...
- GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
- ```
-
- The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
- Using the variables it is possible to select a CPU-based driver as well, if so desired.
-
- You can get a list of platforms and devices from the `clinfo -l` command, etc.
-
- #### Vulkan
**With docker**:
@@ -690,7 +598,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3; to convert LLaMA 3 models downloaded from Hugging Face, use `convert-hf-to-gguf.py` instead.
```bash
# obtain the official LLaMA model weights and place them in ./models
@@ -707,10 +616,10 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python3 convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
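+
+# [Optional] run a quick sanity check on the quantized model
+# (a sketch: -m selects the model, -n the number of tokens to generate, -p the prompt)
+./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -n 64 -p "I believe the meaning of life is"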
diff --git a/build.zig b/build.zig
deleted file mode 100644
index 96783574f..000000000
--- a/build.zig
+++ /dev/null
@@ -1,172 +0,0 @@
-// Compatible with Zig Version 0.11.0
-const std = @import("std");
-const ArrayList = std.ArrayList;
-const Compile = std.Build.Step.Compile;
-const ConfigHeader = std.Build.Step.ConfigHeader;
-const Mode = std.builtin.Mode;
-const CrossTarget = std.zig.CrossTarget;
-
-const Maker = struct {
- builder: *std.build.Builder,
- target: CrossTarget,
- optimize: Mode,
- enable_lto: bool,
-
- include_dirs: ArrayList([]const u8),
- cflags: ArrayList([]const u8),
- cxxflags: ArrayList([]const u8),
- objs: ArrayList(*Compile),
-
- fn addInclude(m: *Maker, dir: []const u8) !void {
- try m.include_dirs.append(dir);
- }
- fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
- try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
- }
- fn addCFlag(m: *Maker, flag: []const u8) !void {
- try m.cflags.append(flag);
- }
- fn addCxxFlag(m: *Maker, flag: []const u8) !void {
- try m.cxxflags.append(flag);
- }
- fn addFlag(m: *Maker, flag: []const u8) !void {
- try m.addCFlag(flag);
- try m.addCxxFlag(flag);
- }
-
- fn init(builder: *std.build.Builder) !Maker {
- const target = builder.standardTargetOptions(.{});
- const zig_version = @import("builtin").zig_version_string;
- const commit_hash = try std.ChildProcess.exec(
- .{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
- );
- try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
- \\int LLAMA_BUILD_NUMBER = {};
- \\char const *LLAMA_COMMIT = "{s}";
- \\char const *LLAMA_COMPILER = "Zig {s}";
- \\char const *LLAMA_BUILD_TARGET = "{s}";
- \\
- , .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
- var m = Maker{
- .builder = builder,
- .target = target,
- .optimize = builder.standardOptimizeOption(.{}),
- .enable_lto = false,
- .include_dirs = ArrayList([]const u8).init(builder.allocator),
- .cflags = ArrayList([]const u8).init(builder.allocator),
- .cxxflags = ArrayList([]const u8).init(builder.allocator),
- .objs = ArrayList(*Compile).init(builder.allocator),
- };
-
- try m.addCFlag("-std=c11");
- try m.addCxxFlag("-std=c++11");
- try m.addProjectInclude(&.{});
- try m.addProjectInclude(&.{"common"});
- return m;
- }
-
- fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
- const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
- if (o.target.getAbi() != .msvc)
- o.defineCMacro("_GNU_SOURCE", null);
-
- if (std.mem.endsWith(u8, src, ".c")) {
- o.addCSourceFiles(&.{src}, m.cflags.items);
- o.linkLibC();
- } else {
- o.addCSourceFiles(&.{src}, m.cxxflags.items);
- if (o.target.getAbi() == .msvc) {
- o.linkLibC(); // need winsdk + crt
- } else {
- // linkLibCpp already add (libc++ + libunwind + libc)
- o.linkLibCpp();
- }
- }
- for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
- o.want_lto = m.enable_lto;
- return o;
- }
-
- fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
- const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
- e.addCSourceFiles(&.{src}, m.cxxflags.items);
- for (deps) |d| e.addObject(d);
- for (m.objs.items) |o| e.addObject(o);
- for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
-
- // https://github.com/ziglang/zig/issues/15448
- if (e.target.getAbi() == .msvc) {
- e.linkLibC(); // need winsdk + crt
- } else {
- // linkLibCpp already add (libc++ + libunwind + libc)
- e.linkLibCpp();
- }
- m.builder.installArtifact(e);
- e.want_lto = m.enable_lto;
- return e;
- }
-};
-
-pub fn build(b: *std.build.Builder) !void {
- var make = try Maker.init(b);
- make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
-
- const ggml = make.obj("ggml", "ggml.c");
- const sgemm = make.obj("sgemm", "sgemm.cpp");
- const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
- const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
- const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
- const unicode = make.obj("unicode", "unicode.cpp");
- const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
- const llama = make.obj("llama", "llama.cpp");
- const buildinfo = make.obj("common", "common/build-info.cpp");
- const common = make.obj("common", "common/common.cpp");
- const console = make.obj("console", "common/console.cpp");
- const sampling = make.obj("sampling", "common/sampling.cpp");
- const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
- const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
- const train = make.obj("train", "common/train.cpp");
- const clip = make.obj("clip", "examples/llava/clip.cpp");
- const llava = make.obj("llava", "examples/llava/llava.cpp");
-
- _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
- _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
- _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
- _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
-
- const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
- if (server.target.isWindows()) {
- server.linkSystemLibrary("ws2_32");
- }
-
- const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
- for (server_assets) |asset| {
- const input_path = b.fmt("examples/server/public/{s}", .{asset});
- const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
-
- // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
-
- const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
- defer b.allocator.free(input);
-
- var buf = std.ArrayList(u8).init(b.allocator);
- defer buf.deinit();
-
- for (input) |byte| {
- try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
- }
-
- var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
- defer b.allocator.free(name);
- std.mem.replaceScalar(u8, name, '.', '_');
-
- try std.fs.cwd().writeFile(output_path, b.fmt(
- "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
- .{ name, buf.items, name, input.len },
- ));
-
- std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
- }
-}
diff --git a/ci/run.sh b/ci/run.sh
index d5972480b..3fc5f48b2 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
}
function gg_get_model {
- local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
- local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
- if [[ -s $gguf_3b ]]; then
- echo -n "$gguf_3b"
- elif [[ -s $gguf_7b ]]; then
- echo -n "$gguf_7b"
+ local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+ local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+ local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+ if [[ -s $gguf_0 ]]; then
+ echo -n "$gguf_0"
+ elif [[ -s $gguf_1 ]]; then
+ echo -n "$gguf_1"
+ elif [[ -s $gguf_2 ]]; then
+ echo -n "$gguf_2"
else
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1
@@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n'
}
-# open_llama_3b_v2
-
-function gg_run_open_llama_3b_v2 {
- cd ${SRC}
-
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
- gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
-
- gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
- unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
- head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
-
- path_models="../models-mnt/open-llama/3B-v2"
- path_wiki="../models-mnt/wikitext/wikitext-2-raw"
-
- rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
-
- set -e
-
- (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
- (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
-
- python3 ../convert.py ${path_models}
-
- model_f16="${path_models}/ggml-model-f16.gguf"
- model_q8_0="${path_models}/ggml-model-q8_0.gguf"
- model_q4_0="${path_models}/ggml-model-q4_0.gguf"
- model_q4_1="${path_models}/ggml-model-q4_1.gguf"
- model_q5_0="${path_models}/ggml-model-q5_0.gguf"
- model_q5_1="${path_models}/ggml-model-q5_1.gguf"
- model_q2_k="${path_models}/ggml-model-q2_k.gguf"
- model_q3_k="${path_models}/ggml-model-q3_k.gguf"
- model_q4_k="${path_models}/ggml-model-q4_k.gguf"
- model_q5_k="${path_models}/ggml-model-q5_k.gguf"
- model_q6_k="${path_models}/ggml-model-q6_k.gguf"
-
- wiki_test_60="${path_wiki}/wiki.test-60.raw"
-
- ./bin/quantize ${model_f16} ${model_q8_0} q8_0
- ./bin/quantize ${model_f16} ${model_q4_0} q4_0
- ./bin/quantize ${model_f16} ${model_q4_1} q4_1
- ./bin/quantize ${model_f16} ${model_q5_0} q5_0
- ./bin/quantize ${model_f16} ${model_q5_1} q5_1
- ./bin/quantize ${model_f16} ${model_q2_k} q2_k
- ./bin/quantize ${model_f16} ${model_q3_k} q3_k
- ./bin/quantize ${model_f16} ${model_q4_k} q4_k
- ./bin/quantize ${model_f16} ${model_q5_k} q5_k
- ./bin/quantize ${model_f16} ${model_q6_k} q6_k
-
- (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
- (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
- (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
- (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
- (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
- (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
- (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
- (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
- (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
- (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
- (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
- (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
- (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
- (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
- (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
- (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
- (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
- (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
- (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
- (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
- (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
-
- (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-
- (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
- (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
-
- function check_ppl {
- qnt="$1"
- ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
- if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
- printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
- return 20
- fi
-
- printf ' - %s @ %s OK\n' "$qnt" "$ppl"
- return 0
- }
-
- check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
- check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
-
- cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
-
- set +e
-}
-
-function gg_sum_open_llama_3b_v2 {
- gg_printf '### %s\n\n' "${ci}"
-
- gg_printf 'OpenLLaMA 3B-v2:\n'
- gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
- gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
- gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
- gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
- gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
- gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
- gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
- gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
- gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
- gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
- gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
- gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
- gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
- gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
- gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-}
-
# open_llama_7b_v2
# requires: GG_BUILD_CUDA
@@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
- python3 ../convert.py ${path_models}
+ python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
+# pythia_1_4b
+
+function gg_run_pythia_1_4b {
+ cd ${SRC}
+
+ gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
+ gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
+ gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
+ gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
+ gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
+
+ gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+ head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
+
+ path_models="../models-mnt/pythia/1.4B"
+ path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+ rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+ set -e
+
+ (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+ python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+ model_f16="${path_models}/ggml-model-f16.gguf"
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+ wiki_test_60="${path_wiki}/wiki.test-60.raw"
+
+ ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+ ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+ ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+ ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+ ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+ ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+ ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+ ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+ ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+ ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+ (time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ (time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+ (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+ function check_ppl {
+ qnt="$1"
+ ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+ if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+ printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+ return 20
+ fi
+
+ printf ' - %s @ %s OK\n' "$qnt" "$ppl"
+ return 0
+ }
+
+ check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+ check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+ cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+ set +e
+}
+
+function gg_sum_pythia_1_4b {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Pythia 1.4B:\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+ gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+ gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+ gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+ gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+ gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+ gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+ gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+ gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+ gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+ gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+ gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+ gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+ gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
+# pythia_2_8b
+# requires: GG_BUILD_CUDA
+
+function gg_run_pythia_2_8b {
+ cd ${SRC}
+
+ gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
+ gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
+ gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
+ gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
+ gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
+
+ gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
+ unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
+
+ path_models="../models-mnt/pythia/2.8B"
+ path_wiki="../models-mnt/wikitext/wikitext-2-raw"
+
+ rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+ set -e
+
+ (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+ (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+ python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+
+ model_f16="${path_models}/ggml-model-f16.gguf"
+ model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+ model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+ model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+ model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+ model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+ model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+ model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+ model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+ model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+ model_q6_k="${path_models}/ggml-model-q6_k.gguf"
+
+ wiki_test="${path_wiki}/wiki.test.raw"
+
+ ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+ ./bin/quantize ${model_f16} ${model_q4_0} q4_0
+ ./bin/quantize ${model_f16} ${model_q4_1} q4_1
+ ./bin/quantize ${model_f16} ${model_q5_0} q5_0
+ ./bin/quantize ${model_f16} ${model_q5_1} q5_1
+ ./bin/quantize ${model_f16} ${model_q2_k} q2_k
+ ./bin/quantize ${model_f16} ${model_q3_k} q3_k
+ ./bin/quantize ${model_f16} ${model_q4_k} q4_k
+ ./bin/quantize ${model_f16} ${model_q5_k} q5_k
+ ./bin/quantize ${model_f16} ${model_q6_k} q6_k
+
+ (time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ (time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+ (time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+ (time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
+ (time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
+ (time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
+ (time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
+ (time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
+ (time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
+ (time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
+ (time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
+ (time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
+
+ (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
+
+ (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+ (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+
+ function check_ppl {
+ qnt="$1"
+ ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
+
+ if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
+ printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
+ return 20
+ fi
+
+ printf ' - %s @ %s OK\n' "$qnt" "$ppl"
+ return 0
+ }
+
+ check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ #check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
+ check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+ check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
+
+ cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
+
+ set +e
+}
+
+function gg_sum_pythia_2_8b {
+ gg_printf '### %s\n\n' "${ci}"
+
+ gg_printf 'Pythia 2.8B:\n'
+ gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+ gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
+ gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
+ gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+ gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+ gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
+ gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
+ gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
+ gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
+ gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
+ gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
+ gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
+ gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
+ gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
+ gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
+}
+
# bge-small
function gg_run_embd_bge_small {
@@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
- python3 ../convert-hf-to-gguf.py ${path_models}
+ python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then
- test $ret -eq 0 && gg_run open_llama_3b_v2
+ test $ret -eq 0 && gg_run pythia_1_4b
else
- test $ret -eq 0 && gg_run open_llama_7b_v2
+ test $ret -eq 0 && gg_run pythia_2_8b
+ #test $ret -eq 0 && gg_run open_llama_7b_v2
fi
test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release
diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake
index 46fba6514..802379680 100644
--- a/cmake/arm64-windows-llvm.cmake
+++ b/cmake/arm64-windows-llvm.cmake
@@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
-set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
+set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
diff --git a/cmake/llama.pc.in b/cmake/llama.pc.in
new file mode 100644
index 000000000..326acbb61
--- /dev/null
+++ b/cmake/llama.pc.in
@@ -0,0 +1,10 @@
+prefix=@CMAKE_INSTALL_PREFIX@
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: llama
+Description: Port of Facebook's LLaMA model in C/C++
+Version: @PROJECT_VERSION@
+Libs: -L${libdir} -lllama
+Cflags: -I${includedir}
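+
+# Example usage (a sketch, assuming the library has been installed under the prefix above):
+#   cc my_app.c $(pkg-config --cflags --libs llama) -o my_app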
diff --git a/common/common.cpp b/common/common.cpp
index ae11650b4..c8df9a4ce 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -7,20 +7,21 @@
#include
#include
+#include
#include
+#include
+#include
#include
#include
#include
-#include
#include
+#include
#include
#include
#include
#include
#include
#include
-#include
-#include
#if defined(__APPLE__) && defined(__MACH__)
#include
@@ -73,7 +74,11 @@
using json = nlohmann::ordered_json;
-int32_t get_num_physical_cores() {
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores() {
#ifdef __linux__
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set siblings;
@@ -142,9 +147,9 @@ static bool is_running_on_efficiency_core(void) {
return core_type == intel_atom;
}
-static int count_math_cpus(int cpu_count) {
+static int cpu_count_math_cpus(int n_cpu) {
int result = 0;
- for (int cpu = 0; cpu < cpu_count; ++cpu) {
+ for (int cpu = 0; cpu < n_cpu; ++cpu) {
if (pin_cpu(cpu)) {
return -1;
}
@@ -162,16 +167,16 @@ static int count_math_cpus(int cpu_count) {
/**
* Returns number of CPUs on system that are useful for math.
*/
-int get_math_cpu_count() {
+int32_t cpu_get_num_math() {
#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
- int cpu_count = sysconf(_SC_NPROCESSORS_ONLN);
- if (cpu_count < 1) {
- return get_num_physical_cores();
+ int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
+ if (n_cpu < 1) {
+ return cpu_get_num_physical_cores();
}
if (is_hybrid_cpu()) {
cpu_set_t affinity;
if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) {
- int result = count_math_cpus(cpu_count);
+ int result = cpu_count_math_cpus(n_cpu);
pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity);
if (result > 0) {
return result;
@@ -179,109 +184,104 @@ int get_math_cpu_count() {
}
}
#endif
- return get_num_physical_cores();
+ return cpu_get_num_physical_cores();
}
-void process_escapes(std::string & input) {
- std::size_t input_len = input.length();
- std::size_t output_idx = 0;
+//
+// CLI argument parsing
+//
- for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
- if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
- switch (input[++input_idx]) {
- case 'n': input[output_idx++] = '\n'; break;
- case 'r': input[output_idx++] = '\r'; break;
- case 't': input[output_idx++] = '\t'; break;
- case '\'': input[output_idx++] = '\''; break;
- case '\"': input[output_idx++] = '\"'; break;
- case '\\': input[output_idx++] = '\\'; break;
- case 'x':
- // Handle \x12, etc
- if (input_idx + 2 < input_len) {
- const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
- char *err_p = nullptr;
- const long val = std::strtol(x, &err_p, 16);
- if (err_p == x + 2) {
- input_idx += 2;
- input[output_idx++] = char(val);
- break;
- }
- }
- // fall through
- default: input[output_idx++] = '\\';
- input[output_idx++] = input[input_idx]; break;
+void gpt_params_handle_model_default(gpt_params & params) {
+ if (!params.hf_repo.empty()) {
+ // short-hand to avoid specifying --hf-file -> default it to --model
+ if (params.hf_file.empty()) {
+ if (params.model.empty()) {
+ throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
}
- } else {
- input[output_idx++] = input[input_idx];
+ params.hf_file = params.model;
+ } else if (params.model.empty()) {
+ std::string cache_directory = fs_get_cache_directory();
+ const bool success = fs_create_directory_with_parents(cache_directory);
+ if (!success) {
+ throw std::runtime_error("failed to create cache directory: " + cache_directory);
+ }
+ params.model = cache_directory + string_split(params.hf_file, '/').back();
+ }
+ } else if (!params.model_url.empty()) {
+ if (params.model.empty()) {
+ auto f = string_split(params.model_url, '#').front();
+ f = string_split(f, '?').front();
+ f = string_split(f, '/').back();
+ params.model = "models/" + f;
+ }
+ } else if (params.model.empty()) {
+ params.model = DEFAULT_MODEL_PATH;
+ }
+}
+
+bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
+ bool invalid_param = false;
+ std::string arg;
+ const std::string arg_prefix = "--";
+ llama_sampling_params & sparams = params.sparams;
+
+ for (int i = 1; i < argc; i++) {
+ arg = argv[i];
+ if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
+ std::replace(arg.begin(), arg.end(), '_', '-');
+ }
+ if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
+ throw std::invalid_argument("error: unknown argument: " + arg);
+ }
+ if (invalid_param) {
+ throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
}
- input.resize(output_idx);
+ if (params.prompt_cache_all && (params.interactive || params.interactive_first)) {
+ throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
+ }
+
+ gpt_params_handle_model_default(params);
+
+ if (params.escape) {
+ string_process_escapes(params.prompt);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
+ string_process_escapes(sparams.cfg_negative_prompt);
+ for (auto & antiprompt : params.antiprompt) {
+ string_process_escapes(antiprompt);
+ }
+ }
+
+ if (!params.kv_overrides.empty()) {
+ params.kv_overrides.emplace_back();
+ params.kv_overrides.back().key[0] = 0;
+ }
+
+ return true;
}
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
- bool result = true;
- try {
- if (!gpt_params_parse_ex(argc, argv, params)) {
- gpt_print_usage(argc, argv, gpt_params());
- exit(0);
- }
- }
- catch (const std::invalid_argument & ex) {
- fprintf(stderr, "%s\n", ex.what());
- gpt_print_usage(argc, argv, gpt_params());
- exit(1);
- }
- return result;
-}
+ const auto params_org = params; // the example can modify the default params
-bool parse_kv_override(const char * data, std::vector & overrides) {
- const char * sep = strchr(data, '=');
- if (sep == nullptr || sep - data >= 128) {
- fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
- return false;
- }
- llama_model_kv_override kvo;
- std::strncpy(kvo.key, data, sep - data);
- kvo.key[sep - data] = 0;
- sep++;
- if (strncmp(sep, "int:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
- kvo.val_i64 = std::atol(sep);
- } else if (strncmp(sep, "float:", 6) == 0) {
- sep += 6;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
- kvo.val_f64 = std::atof(sep);
- } else if (strncmp(sep, "bool:", 5) == 0) {
- sep += 5;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
- if (std::strcmp(sep, "true") == 0) {
- kvo.val_bool = true;
- } else if (std::strcmp(sep, "false") == 0) {
- kvo.val_bool = false;
- } else {
- fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ try {
+ if (!gpt_params_parse_ex(argc, argv, params) || params.usage) {
+ params = params_org;
+ params.usage = true;
return false;
}
- } else if (strncmp(sep, "str:", 4) == 0) {
- sep += 4;
- kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
- if (strlen(sep) > 127) {
- fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
- return false;
- }
- strncpy(kvo.val_str, sep, 127);
- kvo.val_str[127] = '\0';
- } else {
- fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ } catch (const std::invalid_argument & ex) {
+ fprintf(stderr, "%s\n", ex.what());
return false;
}
- overrides.emplace_back(std::move(kvo));
+
return true;
}
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
+ const char split_delim = ',';
+
llama_sampling_params & sparams = params.sparams;
if (arg == "-s" || arg == "--seed") {
@@ -289,7 +289,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
- // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context.
+ // TODO: this is temporary, in the future the sampling state will be moved fully to llama_sampling_context.
params.seed = std::stoul(argv[i]);
sparams.seed = std::stoul(argv[i]);
return true;
@@ -350,6 +350,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.escape = true;
return true;
}
+ if (arg == "--no-escape") {
+ params.escape = false;
+ return true;
+ }
if (arg == "--prompt-cache") {
if (++i >= argc) {
invalid_param = true;
@@ -404,7 +408,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
- if (arg == "-n" || arg == "--n-predict") {
+ if (arg == "-n" || arg == "--predict" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
return true;
@@ -546,7 +550,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
const auto sampler_names = string_split(argv[i], ';');
- sparams.samplers_sequence = sampler_types_from_names(sampler_names, true);
+ sparams.samplers_sequence = llama_sampling_types_from_names(sampler_names, true);
return true;
}
if (arg == "--sampling-seq") {
@@ -554,7 +558,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
- sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
+ sparams.samplers_sequence = llama_sampling_types_from_chars(argv[i]);
return true;
}
if (arg == "--top-p") {
@@ -901,30 +905,22 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.interactive = true;
return true;
}
- if (arg == "--interactive-specials") {
- params.interactive_specials = true;
+ if (arg == "-sp" || arg == "--special") {
+ params.special = true;
return true;
}
- if (arg == "--embedding") {
+ if (arg == "--embedding" || arg == "--embeddings") {
params.embedding = true;
return true;
}
- if (arg == "--interactive-first") {
+ if (arg == "-if" || arg == "--interactive-first") {
params.interactive_first = true;
return true;
}
- if (arg == "-ins" || arg == "--instruct") {
- params.instruct = true;
- return true;
- }
if (arg == "-cnv" || arg == "--conversation") {
params.conversation = true;
return true;
}
- if (arg == "-cml" || arg == "--chatml") {
- params.chatml = true;
- return true;
- }
if (arg == "--infill") {
params.infill = true;
return true;
@@ -961,7 +957,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.flash_attn = true;
return true;
}
- if (arg == "--color") {
+ if (arg == "-co" || arg == "--color") {
params.use_color = true;
return true;
}
@@ -969,26 +965,26 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.use_mlock = true;
return true;
}
- if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") {
+ if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
}
- if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
+ if (arg == "-ngld" || arg == "--gpu-layers-draft" || arg == "--gpu-layers-draft") {
if (++i >= argc) {
invalid_param = true;
return true;
}
params.n_gpu_layers_draft = std::stoi(argv[i]);
if (!llama_supports_gpu_offload()) {
- fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+ fprintf(stderr, "warning: not compiled with GPU offload support, --gpu-layers-draft option will be ignored\n");
fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
}
return true;
@@ -999,9 +995,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return true;
}
params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--split-mode" || arg == "-sm") {
@@ -1027,9 +1023,9 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
-#ifndef GGML_USE_CUDA_SYCL
- fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUDA_SYCL
+#ifndef GGML_USE_CUDA_SYCL_VULKAN
+ fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CUDA_SYCL_VULKAN
return true;
}
if (arg == "--tensor-split" || arg == "-ts") {
@@ -1084,6 +1080,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
else { invalid_param = true; }
return true;
}
+ if (arg == "-v" || arg == "--verbose") {
+ params.verbose = true;
+ return true;
+ }
if (arg == "--verbose-prompt") {
params.verbose_prompt = true;
return true;
@@ -1148,6 +1148,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.ppl_stride = std::stoi(argv[i]);
return true;
}
+ if (arg == "--ppl-output-type") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ppl_output_type = std::stoi(argv[i]);
+ return true;
+ }
if (arg == "-ptc" || arg == "--print-token-count") {
if (++i >= argc) {
invalid_param = true;
@@ -1160,14 +1168,6 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.check_tensors = true;
return true;
}
- if (arg == "--ppl-output-type") {
- if (++i >= argc) {
- invalid_param = true;
- return true;
- }
- params.ppl_output_type = std::stoi(argv[i]);
- return true;
- }
if (arg == "--hellaswag") {
params.hellaswag = true;
return true;
@@ -1239,19 +1239,15 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
}
return true;
}
- if (arg == "-h" || arg == "--help") {
- gpt_print_usage(argc, argv, gpt_params());
- exit(0);
+ if (arg == "-h" || arg == "--help" || arg == "--usage" ) {
+ params.usage = true;
+ return true;
}
if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
}
- if (arg == "--random-prompt") {
- params.random_prompt = true;
- return true;
- }
if (arg == "--in-prefix-bos") {
params.input_prefix_bos = true;
return true;
@@ -1311,13 +1307,236 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
invalid_param = true;
return true;
}
- if (!parse_kv_override(argv[i], params.kv_overrides)) {
+ if (!string_parse_kv_override(argv[i], params.kv_overrides)) {
fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]);
invalid_param = true;
return true;
}
return true;
}
+ if (arg == "--host") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.hostname = argv[i];
+ return true;
+ }
+ if (arg == "--port") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.port = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.public_path = argv[i];
+ return true;
+ }
+ if (arg == "--api-key") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.api_keys.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--api-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream key_file(argv[i]);
+ if (!key_file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string key;
+ while (std::getline(key_file, key)) {
+ if (!key.empty()) {
+ params.api_keys.push_back(key);
+ }
+ }
+ key_file.close();
+ return true;
+ }
+ if (arg == "--ssl-key-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_key = argv[i];
+ return true;
+ }
+ if (arg == "--ssl-cert-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.ssl_file_cert = argv[i];
+ return true;
+ }
+ if (arg == "--timeout" || arg == "-to") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.timeout_read = std::stoi(argv[i]);
+ params.timeout_write = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "-spf" || arg == "--system-prompt-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i]);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ std::string system_prompt;
+ std::copy(
+ std::istreambuf_iterator<char>(file),
+ std::istreambuf_iterator<char>(),
+ std::back_inserter(system_prompt)
+ );
+ params.system_prompt = system_prompt;
+ return true;
+ }
+ if (arg == "--log-format") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (std::strcmp(argv[i], "json") == 0) {
+ params.log_json = true;
+ } else if (std::strcmp(argv[i], "text") == 0) {
+ params.log_json = false;
+ } else {
+ invalid_param = true;
+ return true;
+ }
+ return true;
+ }
+ if (arg == "--no-slots") {
+ params.endpoint_slots = false;
+ return true;
+ }
+ if (arg == "--metrics") {
+ params.endpoint_metrics = true;
+ return true;
+ }
+ if (arg == "--slot-save-path") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.slot_save_path = argv[i];
+ // if the path doesn't end with DIRECTORY_SEPARATOR, add it
+ if (!params.slot_save_path.empty() && params.slot_save_path[params.slot_save_path.size() - 1] != DIRECTORY_SEPARATOR) {
+ params.slot_save_path += DIRECTORY_SEPARATOR;
+ }
+ return true;
+ }
+ if (arg == "--chat-template") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ if (!llama_chat_verify_template(argv[i])) {
+ fprintf(stderr, "error: the supplied chat template is not supported: %s\n", argv[i]);
+ fprintf(stderr, "note: llama.cpp does not use jinja parser, we only support commonly used templates\n");
+ invalid_param = true;
+ return true;
+ }
+ params.chat_template = argv[i];
+ return true;
+ }
+ if (arg == "-pps") {
+ params.is_pp_shared = true;
+ return true;
+ }
+ if (arg == "-npp") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_pp.insert(params.n_pp.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-ntg") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_tg.insert(params.n_tg.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "-npl") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ auto p = string_split<int>(argv[i], split_delim);
+ params.n_pl.insert(params.n_pl.end(), p.begin(), p.end());
+ return true;
+ }
+ if (arg == "--context-file") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ std::ifstream file(argv[i], std::ios::binary);
+ if (!file) {
+ fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+ invalid_param = true;
+ return true;
+ }
+ params.context_files.push_back(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-size") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_size = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--chunk-separator") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.chunk_separator = argv[i];
+ return true;
+ }
+ if (arg == "--junk") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.n_junk = std::stoi(argv[i]);
+ return true;
+ }
+ if (arg == "--pos") {
+ if (++i >= argc) {
+ invalid_param = true;
+ return true;
+ }
+ params.i_pos = std::stoi(argv[i]);
+ return true;
+ }
#ifndef LOG_DISABLE_LOGS
// Parse args for logging parameters
if (log_param_single_parse(argv[i])) {
@@ -1345,285 +1564,314 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
return false;
}
-void gpt_params_handle_model_default(gpt_params & params) {
- if (!params.hf_repo.empty()) {
- // short-hand to avoid specifying --hf-file -> default it to --model
- if (params.hf_file.empty()) {
- if (params.model.empty()) {
- throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n");
- }
- params.hf_file = params.model;
- } else if (params.model.empty()) {
- std::string cache_directory = get_cache_directory();
- const bool success = create_directory_with_parents(cache_directory);
- if (!success) {
- throw std::runtime_error("failed to create cache directory: " + cache_directory);
- }
- params.model = cache_directory + string_split(params.hf_file, '/').back();
- }
- } else if (!params.model_url.empty()) {
- if (params.model.empty()) {
- auto f = string_split(params.model_url, '#').front();
- f = string_split(f, '?').front();
- f = string_split(f, '/').back();
- params.model = "models/" + f;
- }
- } else if (params.model.empty()) {
- params.model = DEFAULT_MODEL_PATH;
- }
-}
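+// printf-style format checking for the variadic option_info constructor below (MinGW needs gnu_printf)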
+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
- bool invalid_param = false;
- std::string arg;
- const std::string arg_prefix = "--";
- llama_sampling_params & sparams = params.sparams;
-
- for (int i = 1; i < argc; i++) {
- arg = argv[i];
- if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
- std::replace(arg.begin(), arg.end(), '_', '-');
- }
- if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
- throw std::invalid_argument("error: unknown argument: " + arg);
- }
- if (invalid_param) {
- throw std::invalid_argument("error: invalid parameter for argument: " + arg);
- }
- }
-
- if (params.prompt_cache_all &&
- (params.interactive || params.interactive_first ||
- params.instruct)) {
-
- throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n");
- }
-
- gpt_params_handle_model_default(params);
-
- if (params.escape) {
- process_escapes(params.prompt);
- process_escapes(params.input_prefix);
- process_escapes(params.input_suffix);
- process_escapes(sparams.cfg_negative_prompt);
- for (auto & antiprompt : params.antiprompt) {
- process_escapes(antiprompt);
- }
- }
-
- if (!params.kv_overrides.empty()) {
- params.kv_overrides.emplace_back();
- params.kv_overrides.back().key[0] = 0;
- }
-
- return true;
-}
-
-void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
const llama_sampling_params & sparams = params.sparams;
std::string sampler_type_chars;
std::string sampler_type_names;
for (const auto sampler_type : sparams.samplers_sequence) {
sampler_type_chars += static_cast<char>(sampler_type);
- sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+ sampler_type_names += llama_sampling_type_to_str(sampler_type) + ";";
}
sampler_type_names.pop_back();
- printf("\n");
- printf("usage: %s [options]\n", argv[0]);
- printf("\n");
- printf("options:\n");
- printf(" -h, --help show this help message and exit\n");
- printf(" --version show version and build info\n");
- printf(" -i, --interactive run in interactive mode\n");
- printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
- printf(" --interactive-first run in interactive mode and wait for input right away\n");
- printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
- printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
- printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
- printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
- printf(" -r PROMPT, --reverse-prompt PROMPT\n");
- printf(" halt generation at PROMPT, return control in interactive mode\n");
- printf(" (can be specified more than once for multiple prompts).\n");
- printf(" --color colorise output to distinguish prompt and user input from generations\n");
- printf(" -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n");
- printf(" -t N, --threads N number of threads to use during generation (default: %d)\n", params.n_threads);
- printf(" -tb N, --threads-batch N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads)\n");
- printf(" -td N, --threads-draft N");
- printf(" number of threads to use during generation (default: same as --threads)\n");
- printf(" -tbd N, --threads-batch-draft N\n");
- printf(" number of threads to use during batch and prompt processing (default: same as --threads-draft)\n");
- printf(" -p PROMPT, --prompt PROMPT\n");
- printf(" prompt to start generation with (default: empty)\n");
- printf(" -e, --escape process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n");
- printf(" --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n");
- printf(" --prompt-cache-all if specified, saves user input and generations to cache as well.\n");
- printf(" not supported with --interactive or other interactive options\n");
- printf(" --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n");
- printf(" --random-prompt start with a randomized prompt.\n");
- printf(" --in-prefix-bos prefix BOS to user inputs, preceding the `--in-prefix` string\n");
- printf(" --in-prefix STRING string to prefix user inputs with (default: empty)\n");
- printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
- printf(" -f FNAME, --file FNAME\n");
- printf(" prompt file to start generation.\n");
- printf(" -bf FNAME, --binary-file FNAME\n");
- printf(" binary file containing multiple choice tasks.\n");
- printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
- printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
- printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
- printf(" -ub N, --ubatch-size N\n");
- printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
- printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
- printf(" (default: %s)\n", sampler_type_names.c_str());
- printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
- printf(" --top-k N top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
- printf(" --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
- printf(" --min-p N min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
- printf(" --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)sparams.tfs_z);
- printf(" --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)sparams.typical_p);
- printf(" --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", sparams.penalty_last_n);
- printf(" --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)sparams.penalty_repeat);
- printf(" --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_present);
- printf(" --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)sparams.penalty_freq);
- printf(" --dynatemp-range N dynamic temperature range (default: %.1f, 0.0 = disabled)\n", (double)sparams.dynatemp_range);
- printf(" --dynatemp-exp N dynamic temperature exponent (default: %.1f)\n", (double)sparams.dynatemp_exponent);
- printf(" --mirostat N use Mirostat sampling.\n");
- printf(" Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
- printf(" (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", sparams.mirostat);
- printf(" --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)sparams.mirostat_eta);
- printf(" --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)sparams.mirostat_tau);
- printf(" -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n");
- printf(" modifies the likelihood of token appearing in the completion,\n");
- printf(" i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n");
- printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n");
- printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n");
- printf(" --grammar-file FNAME file to read grammar from\n");
- printf(" -j SCHEMA, --json-schema SCHEMA\n");
- printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n");
- printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n");
- printf(" --cfg-negative-prompt PROMPT\n");
- printf(" negative prompt to use for guidance. (default: empty)\n");
- printf(" --cfg-negative-prompt-file FNAME\n");
- printf(" negative prompt file to use for guidance. (default: empty)\n");
- printf(" --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", sparams.cfg_scale);
- printf(" --rope-scaling {none,linear,yarn}\n");
- printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n");
- printf(" --rope-scale N RoPE context scaling factor, expands context by a factor of N\n");
- printf(" --rope-freq-base N RoPE base frequency, used by NTK-aware scaling (default: loaded from model)\n");
- printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n");
- printf(" --yarn-orig-ctx N YaRN: original context size of model (default: 0 = model training context size)\n");
- printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n");
- printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n");
- printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow);
- printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast);
- printf(" --pooling {none,mean,cls}\n");
- printf(" pooling type for embeddings, use model default if unspecified\n");
- printf(" -dt N, --defrag-thold N\n");
- printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold);
- printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n");
- printf(" --penalize-nl penalize newline tokens\n");
- printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp);
- printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n");
- printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
- printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
- printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
- printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
- printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
- printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
- printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base\n");
- printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
- printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
- printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
- printf(" -np N, --parallel N number of parallel sequences to decode (default: %d)\n", params.n_parallel);
- printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences);
- printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
- printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
- printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
- printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
- printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n");
+ struct option_info {
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(4, 5)
+ option_info(const std::string & tags, const char * args, const char * desc, ...) : tags(tags), args(args), desc(desc) {
+ va_list args_list;
+ va_start(args_list, desc);
+ char buffer[1024];
+ vsnprintf(buffer, sizeof(buffer), desc, args_list);
+ va_end(args_list);
+ this->desc = buffer;
+ }
+
+ option_info(const std::string & grp) : grp(grp) {}
+
+ std::string tags;
+ std::string args;
+ std::string desc;
+ std::string grp;
+ };
+
+ std::vector<option_info> options;
+
+ // TODO: filter by tags
+
+ options.push_back({ "general" });
+ options.push_back({ "*", "-h, --help, --usage", "print usage and exit" });
+ options.push_back({ "*", " --version", "show version and build info" });
+ options.push_back({ "*", "-v, --verbose", "print verbose information" });
+ options.push_back({ "*", " --verbose-prompt", "print a verbose prompt before generation (default: %s)", params.verbose_prompt ? "true" : "false" });
+ options.push_back({ "*", " --no-display-prompt", "don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false" });
+ options.push_back({ "*", "-co, --color", "colorise output to distinguish prompt and user input from generations (default: %s)", params.use_color ? "true" : "false" });
+ options.push_back({ "*", "-s, --seed SEED", "RNG seed (default: %d, use random seed for < 0)", params.seed });
+ options.push_back({ "*", "-t, --threads N", "number of threads to use during generation (default: %d)", params.n_threads });
+ options.push_back({ "*", "-tb, --threads-batch N", "number of threads to use during batch and prompt processing (default: same as --threads)" });
+ options.push_back({ "speculative", "-td, --threads-draft N", "number of threads to use during generation (default: same as --threads)" });
+ options.push_back({ "speculative", "-tbd, --threads-batch-draft N",
+ "number of threads to use during batch and prompt processing (default: same as --threads-draft)" });
+ options.push_back({ "speculative", " --draft N", "number of tokens to draft for speculative decoding (default: %d)", params.n_draft });
+ options.push_back({ "speculative", "-ps, --p-split N", "speculative decoding split probability (default: %.1f)", (double)params.p_split });
+ options.push_back({ "*", "-lcs, --lookup-cache-static FNAME",
+ "path to static lookup cache to use for lookup decoding (not updated by generation)" });
+ options.push_back({ "*", "-lcd, --lookup-cache-dynamic FNAME",
+ "path to dynamic lookup cache to use for lookup decoding (updated by generation)" });
+
+ options.push_back({ "*", "-c, --ctx-size N", "size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx });
+ options.push_back({ "*", "-n, --predict N", "number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)", params.n_predict });
+ options.push_back({ "*", "-b, --batch-size N", "logical maximum batch size (default: %d)", params.n_batch });
+ options.push_back({ "*", "-ub, --ubatch-size N", "physical maximum batch size (default: %d)", params.n_ubatch });
+ options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
+ options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
+ options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
+ options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+ options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
+ options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
+ options.push_back({ "*", "-e, --escape", "process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false" });
+ options.push_back({ "*", " --no-escape", "do not process escape sequences" });
+ options.push_back({ "main", "-ptc, --print-token-count N", "print token count every N tokens (default: %d)", params.n_print });
+ options.push_back({ "main", " --prompt-cache FNAME", "file to cache prompt state for faster startup (default: none)" });
+ options.push_back({ "main", " --prompt-cache-all", "if specified, saves user input and generations to cache as well\n"
+ "not supported with --interactive or other interactive options" });
+ options.push_back({ "main", " --prompt-cache-ro", "if specified, uses the prompt cache but does not update it" });
+ options.push_back({ "main", "-r, --reverse-prompt PROMPT",
+ "halt generation at PROMPT, return control in interactive mode\n"
+ "can be specified more than once for multiple prompts" });
+ options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
+ options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+ options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
+ options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
+ options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
+ options.push_back({ "main infill", " --in-prefix-bos", "prefix BOS to user inputs, preceding the `--in-prefix` string" });
+ options.push_back({ "main infill", " --in-prefix STRING", "string to prefix user inputs with (default: empty)" });
+ options.push_back({ "main infill", " --in-suffix STRING", "string to suffix after user inputs with (default: empty)" });
+
+ options.push_back({ "sampling" });
+ options.push_back({ "*", " --samplers SAMPLERS", "samplers that will be used for generation in the order, separated by \';\'\n"
+ "(default: %s)", sampler_type_names.c_str() });
+ options.push_back({ "*", " --sampling-seq SEQUENCE",
+ "simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str() });
+ options.push_back({ "*", " --ignore-eos", "ignore end of stream token and continue generating (implies --logit-bias EOS-inf)" });
+ options.push_back({ "*", " --penalize-nl", "penalize newline tokens (default: %s)", sparams.penalize_nl ? "true" : "false" });
+ options.push_back({ "*", " --temp N", "temperature (default: %.1f)", (double)sparams.temp });
+ options.push_back({ "*", " --top-k N", "top-k sampling (default: %d, 0 = disabled)", sparams.top_k });
+ options.push_back({ "*", " --top-p N", "top-p sampling (default: %.1f, 1.0 = disabled)", (double)sparams.top_p });
+ options.push_back({ "*", " --min-p N", "min-p sampling (default: %.1f, 0.0 = disabled)", (double)sparams.min_p });
+ options.push_back({ "*", " --tfs N", "tail free sampling, parameter z (default: %.1f, 1.0 = disabled)", (double)sparams.tfs_z });
+ options.push_back({ "*", " --typical N", "locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)sparams.typical_p });
+ options.push_back({ "*", " --repeat-last-n N", "last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", sparams.penalty_last_n });
+ options.push_back({ "*", " --repeat-penalty N", "penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)", (double)sparams.penalty_repeat });
+ options.push_back({ "*", " --presence-penalty N", "repeat alpha presence penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_present });
+ options.push_back({ "*", " --frequency-penalty N", "repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)", (double)sparams.penalty_freq });
+ options.push_back({ "*", " --dynatemp-range N", "dynamic temperature range (default: %.1f, 0.0 = disabled)", (double)sparams.dynatemp_range });
+ options.push_back({ "*", " --dynatemp-exp N", "dynamic temperature exponent (default: %.1f)", (double)sparams.dynatemp_exponent });
+ options.push_back({ "*", " --mirostat N", "use Mirostat sampling.\n"
+ "Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"
+ "(default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)", sparams.mirostat });
+ options.push_back({ "*", " --mirostat-lr N", "Mirostat learning rate, parameter eta (default: %.1f)", (double)sparams.mirostat_eta });
+ options.push_back({ "*", " --mirostat-ent N", "Mirostat target entropy, parameter tau (default: %.1f)", (double)sparams.mirostat_tau });
+ options.push_back({ "*", " -l TOKEN_ID(+/-)BIAS", "modifies the likelihood of token appearing in the completion,\n"
+ "i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"
+ "or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'" });
+ options.push_back({ "main", " --cfg-negative-prompt PROMPT",
+ "negative prompt to use for guidance (default: '%s')", sparams.cfg_negative_prompt.c_str() });
+ options.push_back({ "main", " --cfg-negative-prompt-file FNAME",
+ "negative prompt file to use for guidance" });
+ options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
+
+ options.push_back({ "grammar" });
+ options.push_back({ "*", " --grammar GRAMMAR", "BNF-like grammar to constrain generations (see samples in grammars/ dir) (default: '%s')", sparams.grammar.c_str() });
+ options.push_back({ "*", " --grammar-file FNAME", "file to read grammar from" });
+ options.push_back({ "*", "-j, --json-schema SCHEMA",
+ "JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object\n"
+ "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });
+
+ options.push_back({ "embedding" });
+ options.push_back({ "embedding", " --pooling {none,mean,cls}",
+ "pooling type for embeddings, use model default if unspecified" });
+
+ options.push_back({ "context hacking" });
+ options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
+ "RoPE frequency scaling method, defaults to linear unless specified by the model" });
+ options.push_back({ "*", " --rope-scale N", "RoPE context scaling factor, expands context by a factor of N" });
+ options.push_back({ "*", " --rope-freq-base N", "RoPE base frequency, used by NTK-aware scaling (default: loaded from model)" });
+ options.push_back({ "*", " --rope-freq-scale N", "RoPE frequency scaling factor, expands context by a factor of 1/N" });
+ options.push_back({ "*", " --yarn-orig-ctx N", "YaRN: original context size of model (default: %d = model training context size)", params.yarn_orig_ctx });
+ options.push_back({ "*", " --yarn-ext-factor N", "YaRN: extrapolation mix factor (default: %.1f, 0.0 = full interpolation)", (double)params.yarn_ext_factor });
+ options.push_back({ "*", " --yarn-attn-factor N", "YaRN: scale sqrt(t) or attention magnitude (default: %.1f)", (double)params.yarn_attn_factor });
+ options.push_back({ "*", " --yarn-beta-slow N", "YaRN: high correction dim or alpha (default: %.1f)", (double)params.yarn_beta_slow });
+ options.push_back({ "*", " --yarn-beta-fast N", "YaRN: low correction dim or beta (default: %.1f)", (double)params.yarn_beta_fast });
+ options.push_back({ "*", "-gan, --grp-attn-n N", "group-attention factor (default: %d)", params.grp_attn_n });
+ options.push_back({ "*", "-gaw, --grp-attn-w N", "group-attention width (default: %.1f)", (double)params.grp_attn_w });
+ options.push_back({ "*", "-dkvc, --dump-kv-cache", "verbose print of the KV cache" });
+ options.push_back({ "*", "-nkvo, --no-kv-offload", "disable KV offload" });
+ options.push_back({ "*", "-ctk, --cache-type-k TYPE", "KV cache data type for K (default: %s)", params.cache_type_k.c_str() });
+ options.push_back({ "*", "-ctv, --cache-type-v TYPE", "KV cache data type for V (default: %s)", params.cache_type_v.c_str() });
+
+ options.push_back({ "perplexity" });
+ options.push_back({ "perplexity", " --all-logits", "return logits for all tokens in the batch (default: %s)", params.logits_all ? "true" : "false" });
+ options.push_back({ "perplexity", " --hellaswag", "compute HellaSwag score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --hellaswag-tasks N", "number of tasks to use when computing the HellaSwag score (default: %zu)", params.hellaswag_tasks });
+ options.push_back({ "perplexity", " --winogrande", "compute Winogrande score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --winogrande-tasks N", "number of tasks to use when computing the Winogrande score (default: %zu)", params.winogrande_tasks });
+ options.push_back({ "perplexity", " --multiple-choice", "compute multiple choice score over random tasks from datafile supplied with -f" });
+ options.push_back({ "perplexity", " --multiple-choice-tasks N",
+ "number of tasks to use when computing the multiple choice score (default: %zu)", params.multiple_choice_tasks });
+ options.push_back({ "perplexity", " --kl-divergence", "computes KL-divergence to logits provided via --kl-divergence-base" });
+ options.push_back({ "perplexity", " --ppl-stride N", "stride for perplexity calculation (default: %d)", params.ppl_stride });
+ options.push_back({ "perplexity", " --ppl-output-type {0,1}",
+ "output type for perplexity calculation (default: %d)", params.ppl_output_type });
+
+ options.push_back({ "parallel" });
+ options.push_back({ "*", "-dt, --defrag-thold N", "KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold });
+ options.push_back({ "*", "-np, --parallel N", "number of parallel sequences to decode (default: %d)", params.n_parallel });
+ options.push_back({ "*", "-ns, --sequences N", "number of sequences to decode (default: %d)", params.n_sequences });
+ options.push_back({ "*", "-cb, --cont-batching", "enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled" });
+
+ options.push_back({ "multi-modality" });
+ options.push_back({ "*", " --mmproj FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md" });
+ options.push_back({ "*", " --image FILE", "path to an image file. use with multimodal models. Specify multiple times for batching" });
+
+ options.push_back({ "backend" });
+ options.push_back({ "*", " --rpc SERVERS", "comma separated list of RPC servers" });
if (llama_supports_mlock()) {
- printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
+ options.push_back({ "*", " --mlock", "force system to keep model in RAM rather than swapping or compressing" });
}
if (llama_supports_mmap()) {
- printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
+ options.push_back({ "*", " --no-mmap", "do not memory-map model (slower load but may reduce pageouts if not using mlock)" });
}
- printf(" --numa TYPE attempt optimizations that help on some NUMA systems\n");
- printf(" - distribute: spread execution evenly over all nodes\n");
- printf(" - isolate: only spawn threads on CPUs on the node that execution started on\n");
- printf(" - numactl: use the CPU map provided by numactl\n");
- printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
- printf(" see https://github.com/ggerganov/llama.cpp/issues/1437\n");
+ options.push_back({ "*", " --numa TYPE", "attempt optimizations that help on some NUMA systems\n"
+ " - distribute: spread execution evenly over all nodes\n"
+ " - isolate: only spawn threads on CPUs on the node that execution started on\n"
+ " - numactl: use the CPU map provided by numactl\n"
+ "if run without this previously, it is recommended to drop the system page cache before using this\n"
+ "see https://github.com/ggerganov/llama.cpp/issues/1437" });
+
if (llama_supports_gpu_offload()) {
- printf(" -ngl N, --n-gpu-layers N\n");
- printf(" number of layers to store in VRAM\n");
- printf(" -ngld N, --n-gpu-layers-draft N\n");
- printf(" number of layers to store in VRAM for the draft model\n");
- printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
- printf(" how to split the model across multiple GPUs, one of:\n");
- printf(" - none: use one GPU only\n");
- printf(" - layer (default): split layers and KV across GPUs\n");
- printf(" - row: split rows across GPUs\n");
- printf(" -ts SPLIT, --tensor-split SPLIT\n");
- printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
- printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
- printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+ options.push_back({ "*", "-ngl, --gpu-layers N",
+ "number of layers to store in VRAM" });
+ options.push_back({ "*", "-ngld, --gpu-layers-draft N",
+ "number of layers to store in VRAM for the draft model" });
+ options.push_back({ "*", "-sm, --split-mode SPLIT_MODE",
+ "how to split the model across multiple GPUs, one of:\n"
+ " - none: use one GPU only\n"
+ " - layer (default): split layers and KV across GPUs\n"
+ " - row: split rows across GPUs" });
+ options.push_back({ "*", "-ts, --tensor-split SPLIT",
+ "fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1" });
+ options.push_back({ "*", "-mg, --main-gpu i", "the GPU to use for the model (with split-mode = none),\n"
+ "or for intermediate results and KV (with split-mode = row) (default: %d)", params.main_gpu });
}
- printf(" --rpc SERVERS comma separated list of RPC servers\n");
- printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
- printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
- printf(" -gan N, --grp-attn-n N\n");
- printf(" group-attention factor (default: %d)\n", params.grp_attn_n);
- printf(" -gaw N, --grp-attn-w N\n");
- printf(" group-attention width (default: %.1f)\n", (double)params.grp_attn_w);
- printf(" -dkvc, --dump-kv-cache\n");
- printf(" verbose print of the KV cache\n");
- printf(" -nkvo, --no-kv-offload\n");
- printf(" disable KV offload\n");
- printf(" -ctk TYPE, --cache-type-k TYPE\n");
- printf(" KV cache data type for K (default: %s)\n", params.cache_type_k.c_str());
- printf(" -ctv TYPE, --cache-type-v TYPE\n");
- printf(" KV cache data type for V (default: %s)\n", params.cache_type_v.c_str());
- printf(" --simple-io use basic IO for better compatibility in subprocesses and limited consoles\n");
- printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
- printf(" --lora-scaled FNAME S apply LoRA adapter with user defined scaling S (implies --no-mmap)\n");
- printf(" --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
- printf(" --control-vector FNAME\n");
- printf(" add a control vector\n");
- printf(" --control-vector-scaled FNAME S\n");
- printf(" add a control vector with user defined scaling S\n");
- printf(" --control-vector-layer-range START END\n");
- printf(" layer range to apply the control vector(s) to, start and end inclusive\n");
- printf(" -m FNAME, --model FNAME\n");
- printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
- printf(" -md FNAME, --model-draft FNAME\n");
- printf(" draft model for speculative decoding (default: unused)\n");
- printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
- printf(" model download url (default: unused)\n");
- printf(" -hfr REPO, --hf-repo REPO\n");
- printf(" Hugging Face model repository (default: unused)\n");
- printf(" -hff FILE, --hf-file FILE\n");
- printf(" Hugging Face model file (default: unused)\n");
- printf(" -ld LOGDIR, --logdir LOGDIR\n");
- printf(" path under which to save YAML logs (no logging if unset)\n");
- printf(" -lcs FNAME, --lookup-cache-static FNAME\n");
- printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n");
- printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n");
- printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n");
- printf(" --override-kv KEY=TYPE:VALUE\n");
- printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
- printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
- printf(" -ptc N, --print-token-count N\n");
- printf(" print token count every N tokens (default: %d)\n", params.n_print);
- printf(" --check-tensors check model tensor data for invalid values\n");
- printf("\n");
+
+ options.push_back({ "model" });
+ options.push_back({ "*", " --check-tensors", "check model tensor data for invalid values (default: %s)", params.check_tensors ? "true" : "false" });
+ options.push_back({ "*", " --override-kv KEY=TYPE:VALUE",
+ "advanced option to override model metadata by key. may be specified multiple times.\n"
+ "types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false" });
+ options.push_back({ "*", " --lora FNAME", "apply LoRA adapter (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-scaled FNAME S", "apply LoRA adapter with user defined scaling S (implies --no-mmap)" });
+ options.push_back({ "*", " --lora-base FNAME", "optional model to use as a base for the layers modified by the LoRA adapter" });
+ options.push_back({ "*", " --control-vector FNAME", "add a control vector" });
+ options.push_back({ "*", " --control-vector-scaled FNAME SCALE",
+ "add a control vector with user defined scaling SCALE" });
+ options.push_back({ "*", " --control-vector-layer-range START END",
+ "layer range to apply the control vector(s) to, start and end inclusive" });
+ options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
+ "or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
+ options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
+ options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
+ options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
+ options.push_back({ "*", "-hff, --hf-file FILE", "Hugging Face model file (default: unused)" });
+
+ options.push_back({ "retrieval" });
+ options.push_back({ "retrieval", " --context-file FNAME", "file to load context from (repeat to specify multiple files)" });
+ options.push_back({ "retrieval", " --chunk-size N", "minimum length of embedded text chunks (default: %d)", params.chunk_size });
+ options.push_back({ "retrieval", " --chunk-separator STRING",
+ "separator between chunks (default: '%s')", params.chunk_separator.c_str() });
+
+ options.push_back({ "passkey" });
+ options.push_back({ "passkey", " --junk N", "number of times to repeat the junk text (default: %d)", params.n_junk });
+ options.push_back({ "passkey", " --pos N", "position of the passkey in the junk text (default: %d)", params.i_pos });
+
+ options.push_back({ "bench" });
+ options.push_back({ "bench", "-pps", "is the prompt shared across parallel sequences (default: %s)", params.is_pp_shared ? "true" : "false" });
+ options.push_back({ "bench", "-npp n0,n1,...", "number of prompt tokens" });
+ options.push_back({ "bench", "-ntg n0,n1,...", "number of text generation tokens" });
+ options.push_back({ "bench", "-npl n0,n1,...", "number of parallel prompts" });
+
+ options.push_back({ "server" });
+ options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
+ options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
+ options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
+ options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+ options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
+ options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
+ options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
+ options.push_back({ "server", " --ssl-cert-file FNAME", "path to file a PEM-encoded SSL certificate" });
+ options.push_back({ "server", " --timeout N", "server read/write timeout in seconds (default: %d)", params.timeout_read });
+ options.push_back({ "server", " --system-prompt-file FNAME",
+ "set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications" });
+ options.push_back({ "server", " --log-format {text,json}",
+ "log output format: json or text (default: json)" });
+ options.push_back({ "server", " --metrics", "enable prometheus compatible metrics endpoint (default: %s)", params.endpoint_metrics ? "enabled" : "disabled" });
+ options.push_back({ "server", " --no-slots", "disables slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled" });
+ options.push_back({ "server", " --slot-save-path PATH", "path to save slot kv cache (default: disabled)" });
+ options.push_back({ "server", " --chat-template JINJA_TEMPLATE",
+ "set custom jinja chat template (default: template taken from model's metadata)\n"
+ "only commonly used templates are accepted:\n"
+ "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
+
#ifndef LOG_DISABLE_LOGS
- log_print_usage();
+ options.push_back({ "logging" });
+ options.push_back({ "*", " --simple-io", "use basic IO for better compatibility in subprocesses and limited consoles" });
+ options.push_back({ "*", "-ld, --logdir LOGDIR", "path under which to save YAML logs (no logging if unset)" });
+ options.push_back({ "logging", " --log-test", "Run simple logging test" });
+ options.push_back({ "logging", " --log-disable", "Disable trace logs" });
+ options.push_back({ "logging", " --log-enable", "Enable trace logs" });
+ options.push_back({ "logging", " --log-file FNAME", "Specify a log filename (without extension)" });
+ options.push_back({ "logging", " --log-new", "Create a separate new log file on start. "
+ "Each log file will have unique name: \"..log\"" });
+ options.push_back({ "logging", " --log-append", "Don't truncate the old log file." });
#endif // LOG_DISABLE_LOGS
+
+ printf("usage: %s [options]\n", argv[0]);
+
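+ // print each entry: group headers on their own line, option args padded to a fixed column, multi-line descriptions aligned under that column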
+ for (const auto & o : options) {
+ if (!o.grp.empty()) {
+ printf("\n%s:\n\n", o.grp.c_str());
+ continue;
+ }
+ printf(" %-32s", o.args.c_str());
+ if (o.args.length() > 30) {
+ printf("\n%34s", "");
+ }
+
+ const auto desc = o.desc;
+ size_t start = 0;
+ size_t end = desc.find('\n');
+ while (end != std::string::npos) {
+ printf("%s\n%34s", desc.substr(start, end - start).c_str(), "");
+ start = end + 1;
+ end = desc.find('\n', start);
+ }
+
+ printf("%s\n", desc.substr(start).c_str());
+ }
+ printf("\n");
}
-std::string get_system_info(const gpt_params & params) {
+std::string gpt_params_get_system_info(const gpt_params & params) {
std::ostringstream os;
os << "system_info: n_threads = " << params.n_threads;
@@ -1635,27 +1883,141 @@ std::string get_system_info(const gpt_params & params) {
return os.str();
}
-std::string gpt_random_prompt(std::mt19937 & rng) {
- const int r = rng() % 10;
- switch (r) {
- case 0: return "So";
- case 1: return "Once upon a time";
- case 2: return "When";
- case 3: return "The";
- case 4: return "After";
- case 5: return "If";
- case 6: return "import";
- case 7: return "He";
- case 8: return "She";
- case 9: return "They";
+//
+// String utils
+//
+
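+// split `input` on each occurrence of `separator`; always returns at least one (possibly empty) element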
+std::vector<std::string> string_split(std::string input, char separator) {
+ std::vector<std::string> parts;
+ size_t separator_pos = input.find(separator);
+ while (separator_pos != std::string::npos) {
+ std::string part = input.substr(0, separator_pos);
+ parts.emplace_back(part);
+ input = input.substr(separator_pos + 1);
+ separator_pos = input.find(separator);
+ }
+ parts.emplace_back(input);
+ return parts;
+}
+
+std::string string_strip(const std::string & str) {
+ size_t start = 0;
+ size_t end = str.size();
+ while (start < end && std::isspace(str[start])) {
+ start++;
+ }
+ while (end > start && std::isspace(str[end - 1])) {
+ end--;
+ }
+ return str.substr(start, end - start);
+}
+
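+// returns a timestamp of the form YYYY_MM_DD-HH_MM_SS.nnnnnnnnn (local time with a nanosecond suffix)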
+std::string string_get_sortable_timestamp() {
+ using clock = std::chrono::system_clock;
+
+ const clock::time_point current_time = clock::now();
+ const time_t as_time_t = clock::to_time_t(current_time);
+ char timestamp_no_ns[100];
+ std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
+
+ const int64_t ns = std::chrono::duration_cast<std::chrono::nanoseconds>(
+ current_time.time_since_epoch() % 1000000000).count();
+ char timestamp_ns[11];
+ snprintf(timestamp_ns, 11, "%09" PRId64, ns);
+
+ return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
+}
+
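+// in-place unescaping of \n, \r, \t, \', \", \\ and \xHH sequences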
+void string_process_escapes(std::string & input) {
+ std::size_t input_len = input.length();
+ std::size_t output_idx = 0;
+
+ for (std::size_t input_idx = 0; input_idx < input_len; ++input_idx) {
+ if (input[input_idx] == '\\' && input_idx + 1 < input_len) {
+ switch (input[++input_idx]) {
+ case 'n': input[output_idx++] = '\n'; break;
+ case 'r': input[output_idx++] = '\r'; break;
+ case 't': input[output_idx++] = '\t'; break;
+ case '\'': input[output_idx++] = '\''; break;
+ case '\"': input[output_idx++] = '\"'; break;
+ case '\\': input[output_idx++] = '\\'; break;
+ case 'x':
+ // Handle \x12, etc
+ if (input_idx + 2 < input_len) {
+ const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
+ char *err_p = nullptr;
+ const long val = std::strtol(x, &err_p, 16);
+ if (err_p == x + 2) {
+ input_idx += 2;
+ input[output_idx++] = char(val);
+ break;
+ }
+ }
+ // fall through
+ default: input[output_idx++] = '\\';
+ input[output_idx++] = input[input_idx]; break;
+ }
+ } else {
+ input[output_idx++] = input[input_idx];
+ }
}
- GGML_UNREACHABLE();
+ input.resize(output_idx);
}
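+// parse a KEY=TYPE:VALUE string (types: int, float, bool, str) and append it to `overrides`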
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+ const char * sep = strchr(data, '=');
+ if (sep == nullptr || sep - data >= 128) {
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ return false;
+ }
+ llama_model_kv_override kvo;
+ std::strncpy(kvo.key, data, sep - data);
+ kvo.key[sep - data] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.val_i64 = std::atol(sep);
+ } else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.val_f64 = std::atof(sep);
+ } else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.val_bool = true;
+ } else if (std::strcmp(sep, "false") == 0) {
+ kvo.val_bool = false;
+ } else {
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ } else if (strncmp(sep, "str:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR;
+ if (strlen(sep) > 127) {
+ fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data);
+ return false;
+ }
+ strncpy(kvo.val_str, sep, 127);
+ kvo.val_str[127] = '\0';
+ } else {
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ overrides.emplace_back(std::move(kvo));
+ return true;
+}
+
+//
+// Filesystem utils
+//
+
// Validate if a filename is safe to use
// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function
-bool validate_file_name(const std::string & filename) {
+bool fs_validate_filename(const std::string & filename) {
if (!filename.length()) {
// Empty filename invalid
return false;
@@ -1724,120 +2086,198 @@ bool validate_file_name(const std::string & filename) {
return true;
}
-//
-// String utils
-//
+// returns true if successful, false otherwise
+bool fs_create_directory_with_parents(const std::string & path) {
+#ifdef _WIN32
+ std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+ std::wstring wpath = converter.from_bytes(path);
-std::vector<std::string> string_split(std::string input, char separator) {
- std::vector<std::string> parts;
- size_t separator_pos = input.find(separator);
- while (separator_pos != std::string::npos) {
- std::string part = input.substr(0, separator_pos);
- parts.emplace_back(part);
- input = input.substr(separator_pos + 1);
- separator_pos = input.find(separator);
+ // if the path already exists, check whether it's a directory
+ const DWORD attributes = GetFileAttributesW(wpath.c_str());
+ if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return true;
}
- parts.emplace_back(input);
- return parts;
-}
-std::string string_strip(const std::string & str) {
- size_t start = 0;
- size_t end = str.size();
- while (start < end && std::isspace(str[start])) {
- start++;
- }
- while (end > start && std::isspace(str[end - 1])) {
- end--;
- }
- return str.substr(start, end - start);
-}
+ size_t pos_slash = 0;
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
- std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
- {"top_k", llama_sampler_type::TOP_K},
- {"top_p", llama_sampler_type::TOP_P},
- {"typical_p", llama_sampler_type::TYPICAL_P},
- {"min_p", llama_sampler_type::MIN_P},
- {"tfs_z", llama_sampler_type::TFS_Z},
- {"temperature", llama_sampler_type::TEMPERATURE}
- };
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
+ const std::wstring subpath = wpath.substr(0, pos_slash);
+ const wchar_t * test = subpath.c_str();
- // since samplers names are written multiple ways
- // make it ready for both system names and input names
- std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
- {"top-k", llama_sampler_type::TOP_K},
- {"top-p", llama_sampler_type::TOP_P},
- {"nucleus", llama_sampler_type::TOP_P},
- {"typical-p", llama_sampler_type::TYPICAL_P},
- {"typical", llama_sampler_type::TYPICAL_P},
- {"min-p", llama_sampler_type::MIN_P},
- {"tfs-z", llama_sampler_type::TFS_Z},
- {"tfs", llama_sampler_type::TFS_Z},
- {"temp", llama_sampler_type::TEMPERATURE}
- };
+ const bool success = CreateDirectoryW(test, NULL);
+ if (!success) {
+ const DWORD error = GetLastError();
- std::vector<llama_sampler_type> sampler_types;
- sampler_types.reserve(names.size());
- for (const auto & name : names)
- {
- auto sampler_item = sampler_canonical_name_map.find(name);
- if (sampler_item != sampler_canonical_name_map.end())
- {
- sampler_types.push_back(sampler_item->second);
- }
- else
- {
- if (allow_alt_names)
- {
- sampler_item = sampler_alt_name_map.find(name);
- if (sampler_item != sampler_alt_name_map.end())
- {
- sampler_types.push_back(sampler_item->second);
+ // if the path already exists, ensure that it's a directory
+ if (error == ERROR_ALREADY_EXISTS) {
+ const DWORD attributes = GetFileAttributesW(subpath.c_str());
+ if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
+ return false;
}
+ } else {
+ return false;
}
}
+
+ pos_slash += 1;
}
- return sampler_types;
-}
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
- std::unordered_map<char, llama_sampler_type> sampler_name_map {
- {'k', llama_sampler_type::TOP_K},
- {'p', llama_sampler_type::TOP_P},
- {'y', llama_sampler_type::TYPICAL_P},
- {'m', llama_sampler_type::MIN_P},
- {'f', llama_sampler_type::TFS_Z},
- {'t', llama_sampler_type::TEMPERATURE}
- };
+ return true;
+#else
+ // if the path already exists, check whether it's a directory
+ struct stat info;
+ if (stat(path.c_str(), &info) == 0) {
+ return S_ISDIR(info.st_mode);
+ }
- std::vector<llama_sampler_type> sampler_types;
- sampler_types.reserve(names_string.size());
- for (const auto & c : names_string) {
- const auto sampler_item = sampler_name_map.find(c);
- if (sampler_item != sampler_name_map.end()) {
- sampler_types.push_back(sampler_item->second);
+ size_t pos_slash = 1; // skip leading slashes for directory creation
+
+ // process path from front to back, procedurally creating directories
+ while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
+ const std::string subpath = path.substr(0, pos_slash);
+ struct stat info;
+
+ // if the path already exists, ensure that it's a directory
+ if (stat(subpath.c_str(), &info) == 0) {
+ if (!S_ISDIR(info.st_mode)) {
+ return false;
+ }
+ } else {
+ // create parent directories
+ const int ret = mkdir(subpath.c_str(), 0755);
+ if (ret != 0) {
+ return false;
+ }
}
+
+ pos_slash += 1;
}
- return sampler_types;
+
+ return true;
+#endif // _WIN32
}
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
- switch (sampler_type) {
- case llama_sampler_type::TOP_K: return "top_k";
- case llama_sampler_type::TFS_Z: return "tfs_z";
- case llama_sampler_type::TYPICAL_P: return "typical_p";
- case llama_sampler_type::TOP_P: return "top_p";
- case llama_sampler_type::MIN_P: return "min_p";
- case llama_sampler_type::TEMPERATURE: return "temperature";
- default : return "";
+std::string fs_get_cache_directory() {
+ std::string cache_directory = "";
+ auto ensure_trailing_slash = [](std::string p) {
+ // Make sure to add trailing slash
+ if (p.back() != DIRECTORY_SEPARATOR) {
+ p += DIRECTORY_SEPARATOR;
+ }
+ return p;
+ };
+ if (getenv("LLAMA_CACHE")) {
+ cache_directory = std::getenv("LLAMA_CACHE");
+ } else {
+#ifdef __linux__
+ if (std::getenv("XDG_CACHE_HOME")) {
+ cache_directory = std::getenv("XDG_CACHE_HOME");
+ } else {
+ cache_directory = std::getenv("HOME") + std::string("/.cache/");
+ }
+#elif defined(__APPLE__)
+ cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
+#elif defined(_WIN32)
+ cache_directory = std::getenv("LOCALAPPDATA");
+#endif // __linux__
+ cache_directory = ensure_trailing_slash(cache_directory);
+ cache_directory += "llama.cpp";
}
+ return ensure_trailing_slash(cache_directory);
}
+
//
// Model utils
//
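+
+// load the model named in `params` (from a HF repo, a URL or a local file), create a context
+// for it, then apply any requested control vectors and LoRA adapters; on any failure the
+// partially-initialized objects are freed and {nullptr, nullptr} is returned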
+std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+ auto mparams = llama_model_params_from_gpt_params(params);
+
+ llama_model * model = nullptr;
+
+ if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+ model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+ } else if (!params.model_url.empty()) {
+ model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
+ } else {
+ model = llama_load_model_from_file(params.model.c_str(), mparams);
+ }
+
+ if (model == NULL) {
+ fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ auto cparams = llama_context_params_from_gpt_params(params);
+
+ llama_context * lctx = llama_new_context_with_model(model, cparams);
+ if (lctx == NULL) {
+ fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ if (!params.control_vectors.empty()) {
+ if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
+ if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
+
+ const auto cvec = llama_control_vector_load(params.control_vectors);
+ if (cvec.n_embd == -1) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+
+ int err = llama_control_vector_apply(lctx,
+ cvec.data.data(),
+ cvec.data.size(),
+ cvec.n_embd,
+ params.control_vector_layer_start,
+ params.control_vector_layer_end);
+ if (err) {
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
+ const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
+ float lora_scale = std::get<1>(params.lora_adapter[i]);
+ int err = llama_model_apply_lora_from_file(model,
+ lora_adapter.c_str(),
+ lora_scale,
+ ((i > 0) || params.lora_base.empty())
+ ? NULL
+ : params.lora_base.c_str(),
+ params.n_threads);
+ if (err != 0) {
+ fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+ llama_free(lctx);
+ llama_free_model(model);
+ return std::make_tuple(nullptr, nullptr);
+ }
+ }
+
+ if (params.ignore_eos) {
+ params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
+ }
+
+ if (params.warmup) {
+ LOG("warming up the model with an empty run\n");
+
+ std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+ llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
+ llama_kv_cache_clear(lctx);
+ llama_synchronize(lctx);
+ llama_reset_timings(lctx);
+ }
+
+ return std::make_tuple(model, lctx);
+}
+
struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
auto mparams = llama_model_default_params();
@@ -1923,27 +2363,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
return cparams;
}
-void llama_batch_clear(struct llama_batch & batch) {
- batch.n_tokens = 0;
-}
-
-void llama_batch_add(
- struct llama_batch & batch,
- llama_token id,
- llama_pos pos,
- const std::vector<llama_seq_id> & seq_ids,
- bool logits) {
- batch.token [batch.n_tokens] = id;
- batch.pos [batch.n_tokens] = pos;
- batch.n_seq_id[batch.n_tokens] = seq_ids.size();
- for (size_t i = 0; i < seq_ids.size(); ++i) {
- batch.seq_id[batch.n_tokens][i] = seq_ids[i];
- }
- batch.logits [batch.n_tokens] = logits;
-
- batch.n_tokens++;
-}
-
#ifdef LLAMA_USE_CURL
static bool starts_with(const std::string & str, const std::string & prefix) {
@@ -2274,90 +2693,29 @@ struct llama_model * llama_load_model_from_hf(
#endif // LLAMA_USE_CURL
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
- auto mparams = llama_model_params_from_gpt_params(params);
+//
+// Batch utils
+//
- llama_model * model = nullptr;
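+// helpers for building a llama_batch manually: llama_batch_clear resets the token count and
+// llama_batch_add appends one token together with its position, sequence ids and logits flag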
+void llama_batch_clear(struct llama_batch & batch) {
+ batch.n_tokens = 0;
+}
- if (!params.hf_repo.empty() && !params.hf_file.empty()) {
- model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
- } else if (!params.model_url.empty()) {
- model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
- } else {
- model = llama_load_model_from_file(params.model.c_str(), mparams);
+void llama_batch_add(
+ struct llama_batch & batch,
+ llama_token id,
+ llama_pos pos,
+ const std::vector<llama_seq_id> & seq_ids,
+ bool logits) {
+ batch.token [batch.n_tokens] = id;
+ batch.pos [batch.n_tokens] = pos;
+ batch.n_seq_id[batch.n_tokens] = seq_ids.size();
+ for (size_t i = 0; i < seq_ids.size(); ++i) {
+ batch.seq_id[batch.n_tokens][i] = seq_ids[i];
}
+ batch.logits [batch.n_tokens] = logits;
- if (model == NULL) {
- fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
- return std::make_tuple(nullptr, nullptr);
- }
-
- auto cparams = llama_context_params_from_gpt_params(params);
-
- llama_context * lctx = llama_new_context_with_model(model, cparams);
- if (lctx == NULL) {
- fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
-
- if (!params.control_vectors.empty()) {
- if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
- if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
-
- const auto cvec = llama_control_vector_load(params.control_vectors);
- if (cvec.n_embd == -1) {
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
-
- int err = llama_control_vector_apply(lctx,
- cvec.data.data(),
- cvec.data.size(),
- cvec.n_embd,
- params.control_vector_layer_start,
- params.control_vector_layer_end);
- if (err) {
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
- }
-
- for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
- const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
- float lora_scale = std::get<1>(params.lora_adapter[i]);
- int err = llama_model_apply_lora_from_file(model,
- lora_adapter.c_str(),
- lora_scale,
- ((i > 0) || params.lora_base.empty())
- ? NULL
- : params.lora_base.c_str(),
- params.n_threads);
- if (err != 0) {
- fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
- llama_free(lctx);
- llama_free_model(model);
- return std::make_tuple(nullptr, nullptr);
- }
- }
-
- if (params.ignore_eos) {
- params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
- }
-
- if (params.warmup) {
- LOG("warming up the model with an empty run\n");
-
- std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
- llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
- llama_kv_cache_clear(lctx);
- llama_synchronize(lctx);
- llama_reset_timings(lctx);
- }
-
- return std::make_tuple(model, lctx);
+ batch.n_tokens++;
}
//
@@ -2445,346 +2803,17 @@ bool llama_should_add_bos_token(const llama_model * model) {
return add_bos != -1 ? bool(add_bos) : (llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
}
-//
-// YAML utils
-//
-
-// returns true if successful, false otherwise
-bool create_directory_with_parents(const std::string & path) {
-#ifdef _WIN32
- std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
- std::wstring wpath = converter.from_bytes(path);
-
- // if the path already exists, check whether it's a directory
- const DWORD attributes = GetFileAttributesW(wpath.c_str());
- if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) {
- return true;
- }
-
- size_t pos_slash = 0;
-
- // process path from front to back, procedurally creating directories
- while ((pos_slash = path.find('\\', pos_slash)) != std::string::npos) {
- const std::wstring subpath = wpath.substr(0, pos_slash);
- const wchar_t * test = subpath.c_str();
-
- const bool success = CreateDirectoryW(test, NULL);
- if (!success) {
- const DWORD error = GetLastError();
-
- // if the path already exists, ensure that it's a directory
- if (error == ERROR_ALREADY_EXISTS) {
- const DWORD attributes = GetFileAttributesW(subpath.c_str());
- if (attributes == INVALID_FILE_ATTRIBUTES || !(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
- return false;
- }
- } else {
- return false;
- }
- }
-
- pos_slash += 1;
- }
-
- return true;
-#else
- // if the path already exists, check whether it's a directory
- struct stat info;
- if (stat(path.c_str(), &info) == 0) {
- return S_ISDIR(info.st_mode);
- }
-
- size_t pos_slash = 1; // skip leading slashes for directory creation
-
- // process path from front to back, procedurally creating directories
- while ((pos_slash = path.find('/', pos_slash)) != std::string::npos) {
- const std::string subpath = path.substr(0, pos_slash);
- struct stat info;
-
- // if the path already exists, ensure that it's a directory
- if (stat(subpath.c_str(), &info) == 0) {
- if (!S_ISDIR(info.st_mode)) {
- return false;
- }
- } else {
- // create parent directories
- const int ret = mkdir(subpath.c_str(), 0755);
- if (ret != 0) {
- return false;
- }
- }
-
- pos_slash += 1;
- }
-
- return true;
-#endif // _WIN32
-}
-
-std::string get_cache_directory() {
- std::string cache_directory = "";
- if (getenv("LLAMA_CACHE")) {
- cache_directory = std::getenv("LLAMA_CACHE");
- if (cache_directory.back() != DIRECTORY_SEPARATOR) {
- cache_directory += DIRECTORY_SEPARATOR;
- }
- } else {
-#ifdef __linux__
- if (std::getenv("XDG_CACHE_HOME")) {
- cache_directory = std::getenv("XDG_CACHE_HOME");
- } else {
- cache_directory = std::getenv("HOME") + std::string("/.cache/");
- }
-#elif defined(__APPLE__)
- cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
-#elif defined(_WIN32)
- cache_directory = std::getenv("APPDATA");
-#endif // __linux__
- cache_directory += "llama.cpp";
- cache_directory += DIRECTORY_SEPARATOR;
- }
- return cache_directory;
-}
-
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%e, ", data[i]);
- }
- fprintf(stream, "%e]\n", data.back());
-}
-
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data) {
- if (data.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- fprintf(stream, "%s: [", prop_name);
- for (size_t i = 0; i < data.size() - 1; ++i) {
- fprintf(stream, "%d, ", data[i]);
- }
- fprintf(stream, "%d]\n", data.back());
-}
-
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data) {
- std::string data_str(data == NULL ? "" : data);
-
- if (data_str.empty()) {
- fprintf(stream, "%s:\n", prop_name);
- return;
- }
-
- size_t pos_start = 0;
- size_t pos_found = 0;
-
- if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
- data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
- data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
- data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
- data_str = "\"" + data_str + "\"";
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- if (data_str.find('\n') == std::string::npos) {
- fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
- return;
- }
-
- fprintf(stream, "%s: |\n", prop_name);
- while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
- fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
- pos_start = pos_found + 1;
- }
-}
-
-std::string get_sortable_timestamp() {
- using clock = std::chrono::system_clock;
-
- const clock::time_point current_time = clock::now();
- const time_t as_time_t = clock::to_time_t(current_time);
- char timestamp_no_ns[100];
- std::strftime(timestamp_no_ns, 100, "%Y_%m_%d-%H_%M_%S", std::localtime(&as_time_t));
-
- const int64_t ns = std::chrono::duration_cast(
- current_time.time_since_epoch() % 1000000000).count();
- char timestamp_ns[11];
- snprintf(timestamp_ns, 11, "%09" PRId64, ns);
-
- return std::string(timestamp_no_ns) + "." + std::string(timestamp_ns);
-}
-
-void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc) {
- const llama_sampling_params & sparams = params.sparams;
-
- fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
- fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
- fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
- fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
- fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
- fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
- fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
- fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
- fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
- fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
- fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
- fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
- fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
- fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
- fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
- fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
- fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
- fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
- fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
-
-#ifdef NDEBUG
- fprintf(stream, "debug: false\n");
-#else
- fprintf(stream, "debug: true\n");
-#endif // NDEBUG
-
- fprintf(stream, "model_desc: %s\n", model_desc);
- fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
-
-#ifdef __OPTIMIZE__
- fprintf(stream, "optimize: true\n");
-#else
- fprintf(stream, "optimize: false\n");
-#endif // __OPTIMIZE__
-
- fprintf(stream, "time: %s\n", timestamp.c_str());
-
- fprintf(stream, "\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "# User Inputs #\n");
- fprintf(stream, "###############\n");
- fprintf(stream, "\n");
-
- fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
- fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
- dump_string_yaml_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
- fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
- fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
- fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
- fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
- fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
- fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
- fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
- dump_string_yaml_multiline(stream, "grammar", sparams.grammar.c_str());
- fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
- fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
- fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
-
- const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
- const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
- fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
-
- dump_string_yaml_multiline(stream, "in_prefix", params.input_prefix.c_str());
- fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
- dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
- fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
- fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
- fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
- fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
- fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
- fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
-
- fprintf(stream, "logit_bias:\n");
- for (std::pair<llama_token, float> lb : sparams.logit_bias) {
- if (ignore_eos && lb.first == logit_bias_eos->first) {
- continue;
- }
- fprintf(stream, " %d: %f", lb.first, lb.second);
- }
-
- fprintf(stream, "lora:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) != 1.0f) {
- continue;
- }
- fprintf(stream, " - %s\n", std::get<0>(la).c_str());
- }
- fprintf(stream, "lora_scaled:\n");
- for (std::tuple<std::string, float> la : params.lora_adapter) {
- if (std::get<1>(la) == 1.0f) {
- continue;
- }
- fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
- }
- fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
- fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
- fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
- fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
- fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
- fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
- fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
- fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
- fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
- fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
- fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
- fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
- fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
- fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
- fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
- fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
- fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
- fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
- dump_string_yaml_multiline(stream, "prompt", params.prompt.c_str());
- fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
- fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
- fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
- dump_vector_int_yaml(stream, "prompt_tokens", prompt_tokens);
- fprintf(stream, "random_prompt: %s # default: false\n", params.random_prompt ? "true" : "false");
- fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
-
- fprintf(stream, "reverse_prompt:\n");
- for (std::string ap : params.antiprompt) {
- size_t pos = 0;
- while ((pos = ap.find('\n', pos)) != std::string::npos) {
- ap.replace(pos, 1, "\\n");
- pos += 1;
- }
-
- fprintf(stream, " - %s\n", ap.c_str());
- }
-
- fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
- fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
- fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
- fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
- fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
- fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
- fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
-
- const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
- dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
-
- fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
- fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
- fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
- fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
- fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
- fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
- fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
- fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+bool llama_chat_verify_template(const std::string & tmpl) {
+ llama_chat_message chat[] = {{"user", "test"}};
+ int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
+ return res >= 0;
}
//
// KV cache utils
//
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
@@ -2807,7 +2836,7 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@@ -2855,6 +2884,10 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
printf("\n=== Done dumping\n");
}
+//
+// Embedding utils
+//
+
void llama_embd_normalize(const float * inp, float * out, int n) {
double sum = 0.0;
for (int i = 0; i < n; i++) {
@@ -3039,3 +3072,222 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
+
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%e, ", data[i]);
+ }
+ fprintf(stream, "%e]\n", data.back());
+}
+
+void yaml_dump_vector_int(FILE * stream, const char * prop_name, const std::vector<int> & data) {
+ if (data.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ fprintf(stream, "%s: [", prop_name);
+ for (size_t i = 0; i < data.size() - 1; ++i) {
+ fprintf(stream, "%d, ", data[i]);
+ }
+ fprintf(stream, "%d]\n", data.back());
+}
+
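+// print a string value for YAML: quoted (with escaped newlines) when it has leading/trailing
+// whitespace, inline when it is a single line, otherwise as a literal block scalar ("|")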
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data) {
+ std::string data_str(data == NULL ? "" : data);
+
+ if (data_str.empty()) {
+ fprintf(stream, "%s:\n", prop_name);
+ return;
+ }
+
+ size_t pos_start = 0;
+ size_t pos_found = 0;
+
+ if (std::isspace(data_str[0]) || std::isspace(data_str.back())) {
+ data_str = std::regex_replace(data_str, std::regex("\n"), "\\n");
+ data_str = std::regex_replace(data_str, std::regex("\""), "\\\"");
+ data_str = std::regex_replace(data_str, std::regex(R"(\\[^n"])"), R"(\$&)");
+ data_str = "\"" + data_str + "\"";
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ if (data_str.find('\n') == std::string::npos) {
+ fprintf(stream, "%s: %s\n", prop_name, data_str.c_str());
+ return;
+ }
+
+ fprintf(stream, "%s: |\n", prop_name);
+ while ((pos_found = data_str.find('\n', pos_start)) != std::string::npos) {
+ fprintf(stream, " %s\n", data_str.substr(pos_start, pos_found-pos_start).c_str());
+ pos_start = pos_found + 1;
+ }
+}
+
+void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc) {
+ const llama_sampling_params & sparams = params.sparams;
+
+ fprintf(stream, "build_commit: %s\n", LLAMA_COMMIT);
+ fprintf(stream, "build_number: %d\n", LLAMA_BUILD_NUMBER);
+ fprintf(stream, "cpu_has_arm_fma: %s\n", ggml_cpu_has_arm_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx: %s\n", ggml_cpu_has_avx() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx_vnni: %s\n", ggml_cpu_has_avx_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx2: %s\n", ggml_cpu_has_avx2() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
+ fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
+ fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false");
+ fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false");
+ fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false");
+ fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
+ fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
+ fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+ fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
+ fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
+ fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
+ fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
+ fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
+ fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
+ fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+ fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");
+
+#ifdef NDEBUG
+ fprintf(stream, "debug: false\n");
+#else
+ fprintf(stream, "debug: true\n");
+#endif // NDEBUG
+
+ fprintf(stream, "model_desc: %s\n", model_desc);
+ fprintf(stream, "n_vocab: %d # output size of the final layer, 32001 for some models\n", llama_n_vocab(llama_get_model(lctx)));
+
+#ifdef __OPTIMIZE__
+ fprintf(stream, "optimize: true\n");
+#else
+ fprintf(stream, "optimize: false\n");
+#endif // __OPTIMIZE__
+
+ fprintf(stream, "time: %s\n", timestamp.c_str());
+
+ fprintf(stream, "\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "# User Inputs #\n");
+ fprintf(stream, "###############\n");
+ fprintf(stream, "\n");
+
+ fprintf(stream, "alias: %s # default: unknown\n", params.model_alias.c_str());
+ fprintf(stream, "batch_size: %d # default: 512\n", params.n_batch);
+ yaml_dump_string_multiline(stream, "cfg_negative_prompt", sparams.cfg_negative_prompt.c_str());
+ fprintf(stream, "cfg_scale: %f # default: 1.0\n", sparams.cfg_scale);
+ fprintf(stream, "chunks: %d # default: -1 (unlimited)\n", params.n_chunks);
+ fprintf(stream, "color: %s # default: false\n", params.use_color ? "true" : "false");
+ fprintf(stream, "ctx_size: %d # default: 512\n", params.n_ctx);
+ fprintf(stream, "escape: %s # default: false\n", params.escape ? "true" : "false");
+ fprintf(stream, "file: # never logged, see prompt instead. Can still be specified for input.\n");
+ fprintf(stream, "frequency_penalty: %f # default: 0.0 \n", sparams.penalty_freq);
+ yaml_dump_string_multiline(stream, "grammar", sparams.grammar.c_str());
+ fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
+ fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
+ fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+
+ const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx)));
+ const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY;
+ fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false");
+
+ yaml_dump_string_multiline(stream, "in_prefix", params.input_prefix.c_str());
+ fprintf(stream, "in_prefix_bos: %s # default: false\n", params.input_prefix_bos ? "true" : "false");
+ yaml_dump_string_multiline(stream, "in_suffix", params.input_prefix.c_str());
+ fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+ fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
+ fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
+ fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());
+
+ fprintf(stream, "logit_bias:\n");
+ for (std::pair<llama_token, float> lb : sparams.logit_bias) {
+ if (ignore_eos && lb.first == logit_bias_eos->first) {
+ continue;
+ }
+ fprintf(stream, " %d: %f", lb.first, lb.second);
+ }
+
+ fprintf(stream, "lora:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) != 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s\n", std::get<0>(la).c_str());
+ }
+ fprintf(stream, "lora_scaled:\n");
+ for (std::tuple<std::string, float> la : params.lora_adapter) {
+ if (std::get<1>(la) == 1.0f) {
+ continue;
+ }
+ fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
+ }
+ fprintf(stream, "lora_base: %s\n", params.lora_base.c_str());
+ fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
+ fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
+ fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);
+ fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau);
+ fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta);
+ fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
+ fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH);
+ fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
+ fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
+ fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers);
+ fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict);
+ fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs);
+ fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false");
+ fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false");
+ fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type);
+ fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride);
+ fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present);
+ yaml_dump_string_multiline(stream, "prompt", params.prompt.c_str());
+ fprintf(stream, "prompt_cache: %s\n", params.path_prompt_cache.c_str());
+ fprintf(stream, "prompt_cache_all: %s # default: false\n", params.prompt_cache_all ? "true" : "false");
+ fprintf(stream, "prompt_cache_ro: %s # default: false\n", params.prompt_cache_ro ? "true" : "false");
+ yaml_dump_vector_int(stream, "prompt_tokens", prompt_tokens);
+ fprintf(stream, "repeat_penalty: %f # default: 1.1\n", sparams.penalty_repeat);
+
+ fprintf(stream, "reverse_prompt:\n");
+ for (std::string ap : params.antiprompt) {
+ size_t pos = 0;
+ while ((pos = ap.find('\n', pos)) != std::string::npos) {
+ ap.replace(pos, 1, "\\n");
+ pos += 1;
+ }
+
+ fprintf(stream, " - %s\n", ap.c_str());
+ }
+
+ fprintf(stream, "rope_freq_base: %f # default: 10000.0\n", params.rope_freq_base);
+ fprintf(stream, "rope_freq_scale: %f # default: 1.0\n", params.rope_freq_scale);
+ fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
+ fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
+ fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+ fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
+ fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
+
+ const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
+ yaml_dump_vector_float(stream, "tensor_split", tensor_split_vector);
+
+ fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
+ fprintf(stream, "threads: %d # default: %u\n", params.n_threads, std::thread::hardware_concurrency());
+ fprintf(stream, "top_k: %d # default: 40\n", sparams.top_k);
+ fprintf(stream, "top_p: %f # default: 0.95\n", sparams.top_p);
+ fprintf(stream, "min_p: %f # default: 0.0\n", sparams.min_p);
+ fprintf(stream, "typical_p: %f # default: 1.0\n", sparams.typical_p);
+ fprintf(stream, "verbose_prompt: %s # default: false\n", params.verbose_prompt ? "true" : "false");
+ fprintf(stream, "display_prompt: %s # default: true\n", params.display_prompt ? "true" : "false");
+}
diff --git a/common/common.h b/common/common.h
index a8e5e50e6..e0a08a61b 100644
--- a/common/common.h
+++ b/common/common.h
@@ -27,7 +27,7 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \
- fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
+ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0)
@@ -35,14 +35,18 @@
// build info
extern int LLAMA_BUILD_NUMBER;
-extern char const *LLAMA_COMMIT;
-extern char const *LLAMA_COMPILER;
-extern char const *LLAMA_BUILD_TARGET;
+extern char const * LLAMA_COMMIT;
+extern char const * LLAMA_COMPILER;
+extern char const * LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info;
-int get_math_cpu_count();
-int32_t get_num_physical_cores();
+//
+// CPU utils
+//
+
+int32_t cpu_get_num_physical_cores();
+int32_t cpu_get_num_math();
//
// CLI argument parsing
@@ -51,12 +55,12 @@ int32_t get_num_physical_cores();
struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
- int32_t n_threads = get_math_cpu_count();
+ int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
- int32_t n_ctx = 512; // context size
+ int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -95,23 +99,23 @@ struct gpt_params {
// // sampling parameters
struct llama_sampling_params sparams;
- std::string model = ""; // model path
- std::string model_draft = ""; // draft model for speculative decoding
+ std::string model = ""; // model path
+ std::string model_draft = ""; // draft model for speculative decoding
std::string model_alias = "unknown"; // model alias
- std::string model_url = ""; // model url to download
- std::string hf_repo = ""; // HF repo
- std::string hf_file = ""; // HF file
+ std::string model_url = ""; // model url to download
+ std::string hf_repo = ""; // HF repo
+ std::string hf_file = ""; // HF file
std::string prompt = "";
- std::string prompt_file = ""; // store the external prompt file name
- std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
- std::string input_prefix = ""; // string to prefix user inputs with
- std::string input_suffix = ""; // string to suffix user inputs with
- std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
- std::string logdir = ""; // directory in which to save YAML log files
+ std::string prompt_file = ""; // store the external prompt file name
+ std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
+ std::string input_prefix = ""; // string to prefix user inputs with
+ std::string input_suffix = ""; // string to suffix user inputs with
+ std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
- std::string logits_file = ""; // file for saving *all* logits
+ std::string logits_file = ""; // file for saving *all* logits
+ std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
@@ -123,8 +127,8 @@ struct gpt_params {
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
- int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
- int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
+ int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
+ int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
@@ -138,18 +142,17 @@ struct gpt_params {
bool kl_divergence = false; // compute KL divergence
- bool random_prompt = false; // do not randomize prompt if none provided
+ bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
+ bool special = false; // enable special token output
bool interactive = false; // interactive mode
- bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+ bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
- bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
- bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
- bool interactive_first = false; // wait for user input immediately
+ bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
@@ -157,10 +160,10 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
- bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
+ bool verbose = false;
bool verbose_prompt = false; // print prompt tokens before generation
bool display_prompt = true; // print prompt before generation
bool infill = false; // use infill mode
@@ -175,37 +178,92 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
+
+ // server params
+ int32_t port = 8080;
+ int32_t timeout_read = 600;
+ int32_t timeout_write = timeout_read;
+ int32_t n_threads_http = -1;
+
+ std::string hostname = "127.0.0.1";
+ std::string public_path = "";
+ std::string chat_template = "";
+ std::string system_prompt = "";
+
+ std::vector<std::string> api_keys;
+
+ std::string ssl_file_key = "";
+ std::string ssl_file_cert = "";
+
+ bool endpoint_slots = true;
+ bool endpoint_metrics = false;
+
+ bool log_json = false;
+
+ std::string slot_save_path;
+
+ // batched-bench params
+ bool is_pp_shared = false;
+
+ std::vector<int32_t> n_pp;
+ std::vector<int32_t> n_tg;
+ std::vector<int32_t> n_pl;
+
+ // retrieval params
+ std::vector<std::string> context_files; // context files to embed
+
+ int32_t chunk_size = 64; // chunk size for context embedding
+
+ std::string chunk_separator = "\n"; // chunk separator for context embedding
+
+ // passkey params
+ int32_t n_junk = 250; // number of times to repeat the junk text
+ int32_t i_pos = -1; // position of the passkey in the junk text
};
void gpt_params_handle_model_default(gpt_params & params);
-bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
+bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
+bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
+void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
-bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
-
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
-
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
-
-bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
-
-std::string get_system_info(const gpt_params & params);
-
-std::string gpt_random_prompt(std::mt19937 & rng);
-
-void process_escapes(std::string& input);
-
-bool validate_file_name(const std::string & filename);
+std::string gpt_params_get_system_info(const gpt_params & params);
//
// String utils
//
-std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
-std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator);
+
std::string string_strip(const std::string & str);
-std::string sampler_type_to_name_string(llama_sampler_type sampler_type);
+std::string string_get_sortable_timestamp();
+
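+// split `str` on `delim` and stream-parse each token into T,
+// e.g. string_split<int32_t>("128,256", ',') yields {128, 256}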
+template<typename T>
+static std::vector<T> string_split(const std::string & str, char delim) {
+ std::vector<T> values;
+ std::istringstream str_stream(str);
+ std::string token;
+ while (std::getline(str_stream, token, delim)) {
+ T value;
+ std::istringstream token_stream(token);
+ token_stream >> value;
+ values.push_back(value);
+ }
+ return values;
+}
+
+bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
+void string_process_escapes(std::string & input);
+
+//
+// Filesystem utils
+//
+
+bool fs_validate_filename(const std::string & filename);
+bool fs_create_directory_with_parents(const std::string & path);
+
+std::string fs_get_cache_directory();
//
// Model utils
@@ -277,29 +335,21 @@ std::string llama_detokenize_bpe(
bool llama_should_add_bos_token(const llama_model * model);
//
-// YAML utils
+// Chat template utils
//
-bool create_directory_with_parents(const std::string & path);
-std::string get_cache_directory();
-void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
-void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
-void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
-std::string get_sortable_timestamp();
-
-void dump_non_result_info_yaml(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
- const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc);
+// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
+bool llama_chat_verify_template(const std::string & tmpl);
//
// KV cache utils
//
// Dump the KV cache view with the number of sequences per cell.
-void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
+void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output).
-void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
//
// Embedding utils
@@ -333,6 +383,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+//
+// YAML utils
+//
+
+void yaml_dump_vector_float(FILE * stream, const char * prop_name, const std::vector<float> & data);
+void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
+void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
+
+void yaml_dump_non_result_info(
+ FILE * stream, const gpt_params & params, const llama_context * lctx,
+ const std::string & timestamp, const std::vector<llama_token> & prompt_tokens, const char * model_desc);
+
diff --git a/common/sampling.cpp b/common/sampling.cpp
index 7fc2e2158..f1f803516 100644
--- a/common/sampling.cpp
+++ b/common/sampling.cpp
@@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) {
- const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+ const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " ";
}
@@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
return result;
}
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
+ switch (sampler_type) {
+ case llama_sampler_type::TOP_K: return "top_k";
+ case llama_sampler_type::TFS_Z: return "tfs_z";
+ case llama_sampler_type::TYPICAL_P: return "typical_p";
+ case llama_sampler_type::TOP_P: return "top_p";
+ case llama_sampler_type::MIN_P: return "min_p";
+ case llama_sampler_type::TEMPERATURE: return "temperature";
+ default : return "";
+ }
+}
+
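+// map user-facing sampler names to llama_sampler_type values; alternative spellings
+// (e.g. "top-k", "nucleus", "temp") are only accepted when allow_alt_names is true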
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
+ std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
+ {"top_k", llama_sampler_type::TOP_K},
+ {"top_p", llama_sampler_type::TOP_P},
+ {"typical_p", llama_sampler_type::TYPICAL_P},
+ {"min_p", llama_sampler_type::MIN_P},
+ {"tfs_z", llama_sampler_type::TFS_Z},
+ {"temperature", llama_sampler_type::TEMPERATURE}
+ };
+
+ // since samplers names are written multiple ways
+ // make it ready for both system names and input names
+ std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
+ {"top-k", llama_sampler_type::TOP_K},
+ {"top-p", llama_sampler_type::TOP_P},
+ {"nucleus", llama_sampler_type::TOP_P},
+ {"typical-p", llama_sampler_type::TYPICAL_P},
+ {"typical", llama_sampler_type::TYPICAL_P},
+ {"min-p", llama_sampler_type::MIN_P},
+ {"tfs-z", llama_sampler_type::TFS_Z},
+ {"tfs", llama_sampler_type::TFS_Z},
+ {"temp", llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names.size());
+ for (const auto & name : names)
+ {
+ auto sampler_item = sampler_canonical_name_map.find(name);
+ if (sampler_item != sampler_canonical_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ else
+ {
+ if (allow_alt_names)
+ {
+ sampler_item = sampler_alt_name_map.find(name);
+ if (sampler_item != sampler_alt_name_map.end())
+ {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ }
+ }
+ return sampler_types;
+}
+
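+// map single-character sampler designators ('k', 'p', 'y', 'm', 'f', 't') to llama_sampler_type
+// values; unknown characters are silently ignored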
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
+ std::unordered_map<char, llama_sampler_type> sampler_name_map {
+ {'k', llama_sampler_type::TOP_K},
+ {'p', llama_sampler_type::TOP_P},
+ {'y', llama_sampler_type::TYPICAL_P},
+ {'m', llama_sampler_type::MIN_P},
+ {'f', llama_sampler_type::TFS_Z},
+ {'t', llama_sampler_type::TEMPERATURE}
+ };
+
+ std::vector<llama_sampler_type> sampler_types;
+ sampler_types.reserve(names_string.size());
+ for (const auto & c : names_string) {
+ const auto sampler_item = sampler_name_map.find(c);
+ if (sampler_item != sampler_name_map.end()) {
+ sampler_types.push_back(sampler_item->second);
+ }
+ }
+ return sampler_types;
+}
+
// no reasons to expose this function in header
static void sampler_queue(
struct llama_context * ctx_main,
diff --git a/common/sampling.h b/common/sampling.h
index 655732ad1..eeaa53b8b 100644
--- a/common/sampling.h
+++ b/common/sampling.h
@@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params);
+std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
+
+std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
+std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
+
// this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call
diff --git a/common/train.cpp b/common/train.cpp
index 0dbfd24df..fef1e57c9 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
params.custom_n_ctx = false;
- params.use_flash = true;
+ params.use_flash = false;
params.use_checkpointing = true;
params.sample_start = "";
@@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
void finish_processing_train_args(struct train_params_common * params) {
if (params->escape) {
- process_escapes(params->sample_start);
+ string_process_escapes(params->sample_start);
}
}
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 1923b88ba..84b72348d 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -81,6 +81,7 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+ {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 6357d4034..ad071b974 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -14,6 +14,7 @@ from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
+import math
import numpy as np
import torch
@@ -24,8 +25,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
-from convert import LlamaHfVocab
-
logger = logging.getLogger("hf-to-gguf")
@@ -312,11 +311,10 @@ class Model:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32
- block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+ shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
# reverse shape to make it similar to the internal ggml dimension order
- shape_str = f"""{{{', '.join(str(n) for n in reversed(
- (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
- )}}}"""
+ shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
# n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@@ -473,6 +471,9 @@ class Model:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
+ if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+ # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+ res = "smaug-bpe"
if res is None:
logger.warning("\n")
@@ -631,7 +632,7 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self):
- vocab = LlamaHfVocab(self.dir_model)
+ vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
@@ -672,6 +673,44 @@ class GPTNeoXModel(Model):
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ del bid # unused
+
+ n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
+ n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
+
+ tensors: list[tuple[str, Tensor]] = []
+
+ if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
+ # Map bloom-style qkv_linear to gpt-style qkv_linear
+ # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
+ # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
+ qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
+ data_torch = torch.cat(
+ (
+ qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
+ qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.weight")
+ elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
+ qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
+ data_torch = torch.cat(
+ (
+ qkv_bias[:, 0, :].reshape((n_embed,)),
+ qkv_bias[:, 1, :].reshape((n_embed,)),
+ qkv_bias[:, 2, :].reshape((n_embed,)),
+ ),
+ dim=0,
+ )
+ logger.info("re-format attention.linear_qkv.bias")
+
+ tensors.append((self.map_tensor_name(name), data_torch))
+
+ return tensors
+
@Model.register("BloomForCausalLM")
class BloomModel(Model):
@@ -1276,6 +1315,17 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ # Apply to granite small models only
+ if self.hparams.get("vocab_size", 32000) == 49152:
+ self.gguf_writer.add_add_bos_token(False)
+
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@@ -1290,9 +1340,9 @@ class LlamaModel(Model):
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
- if name.endswith("q_proj.weight"):
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
- if name.endswith("k_proj.weight"):
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
@@ -1784,23 +1834,59 @@ class Phi3MiniModel(Model):
def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
- rot_pct = 1.0
n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"])
+ n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"])
+ max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
+ orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
+ rope_dims = n_embd // n_head
self.gguf_writer.add_name("Phi3")
- self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"]))
-
+ self.gguf_writer.add_context_length(max_pos_embds)
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
self.gguf_writer.add_embedding_length(n_embd)
- self.gguf_writer.add_feed_forward_length(8192)
+ self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head)
- self.gguf_writer.add_head_count_kv(n_head)
+ self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
- self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head)
+ self.gguf_writer.add_rope_dimension_count(rope_dims)
+ self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype)
+ # write rope scaling for long context (128k) model
+ rope_scaling = self.find_hparam(['rope_scaling'], True)
+ if (rope_scaling is None):
+ return
+
+ scale = max_pos_embds / orig_max_pos_embds
+
+ rope_scaling_type = rope_scaling.get('type', '').lower()
+ if len(rope_scaling_type) == 0:
+ raise KeyError('Missing the required key rope_scaling.type')
+
+ if rope_scaling_type == 'su':
+ attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
+ elif rope_scaling_type == 'yarn':
+ attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
+ else:
+ raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
+
+ self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
+
+ long_factors = rope_scaling.get('long_factor', None)
+ short_factors = rope_scaling.get('short_factor', None)
+
+ if long_factors is None or short_factors is None:
+ raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling_short_factor')
+
+ if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
+ raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
+
+ self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
+ self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
+
@Model.register("PlamoForCausalLM")
class PlamoModel(Model):
@@ -2318,7 +2404,8 @@ class CommandR2Model(Model):
# max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length
- self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+ # aya-23 models don't have model_max_length specified
+ self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -2391,6 +2478,236 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True)
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+ model_arch = gguf.MODEL_ARCH.ARCTIC
+
+ def set_vocab(self):
+ # The reason for using a custom implementation here is that the
+ # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+ # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+ from sentencepiece import SentencePieceProcessor
+
+ tokenizer_path = self.dir_model / 'tokenizer.model'
+
+ if not tokenizer_path.is_file():
+ logger.error(f'Error: Missing {tokenizer_path}')
+ sys.exit(1)
+
+ # Read the whole vocabulary from the tokenizer.model file
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+ # Use the added_tokens_decoder field from tokeniser_config.json as the source
+ # of information about added/redefined tokens and modify them accordingly.
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+
+ if "added_tokens_decoder" in tokenizer_config_json:
+ added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+ for token_id, token_json in added_tokens_decoder.items():
+ token_id = int(token_id)
+ if (token_id >= vocab_size):
+ logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ token_content = token_json["content"]
+ token_type = SentencePieceTokenTypes.USER_DEFINED
+ token_score = -10000.0
+
+ # Map unk_token to UNKNOWN, other special tokens to CONTROL
+ # Set the score to 0.0 as in the original tokenizer.model
+ if ("special" in token_json) and token_json["special"]:
+ if token_content == tokenizer_config_json["unk_token"]:
+ token_type = SentencePieceTokenTypes.UNKNOWN
+ else:
+ token_type = SentencePieceTokenTypes.CONTROL
+ token_score = 0.0
+
+ logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+ tokens[token_id] = token_content.encode("utf-8")
+ toktypes[token_id] = token_type
+ scores[token_id] = token_score
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams.get("num_key_value_heads")
+
+ if name.endswith("q_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+ if name.endswith("k_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+ # process the experts separately
+ if name.find("block_sparse_moe.experts") != -1:
+ n_experts = self.hparams["num_local_experts"]
+
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for wid in ["w1", "w2", "w3"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def write_tensors(self):
+ super().write_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+ def set_vocab(self):
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["n_routed_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def write_tensors(self):
+ super().write_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
###### CONVERSION LOGIC ######
@@ -2523,7 +2840,12 @@ def main() -> None:
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
- model_class = Model.from_model_architecture(hparams["architectures"][0])
+ try:
+ model_class = Model.from_model_architecture(hparams["architectures"][0])
+ except NotImplementedError:
+ logger.error(f"Model {hparams['architectures'][0]} is not supported")
+ sys.exit(1)
+
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
logger.info("Set model parameters")
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
index 48769cdf6..138124248 100644
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b40ee4ccb..53002f8e1 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
- add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md
index bf951baf7..fa4baf640 100644
--- a/examples/batched-bench/README.md
+++ b/examples/batched-bench/README.md
@@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
-./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
+./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
-./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
+./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
-./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
+./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results
diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp
index 2924d8116..718f0a61a 100644
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] \n" , argv[0]);
- printf(" , and PL are comma-separated lists of numbers without spaces\n\n");
- printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
- return 1 ;
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- int n_kv_max = 2048;
- int n_batch = 2048;
- int n_ubatch = 512;
- bool flash_attn = false;
- int is_pp_shared = 0;
- int n_gpu_layers = 0;
+ int is_pp_shared = params.is_pp_shared;
- std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
- std::vector<int> n_tg = { 128, 256, };
- std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
- //std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_kv_max = std::atoi(argv[2]);
- }
-
- if (argc >= 4) {
- n_batch = std::atoi(argv[3]);
- }
-
- if (argc >= 5) {
- n_ubatch = std::atoi(argv[4]);
- }
-
- if (argc >= 6) {
- flash_attn = std::atoi(argv[5]);
- }
-
- if (argc >= 7) {
- is_pp_shared = std::atoi(argv[6]);
- }
-
- if (argc >= 8) {
- n_gpu_layers = std::atoi(argv[7]);
- }
-
- if (argc >= 9) {
- n_pp = parse_list(argv[8]);
- }
-
- if (argc >= 10) {
- n_tg = parse_list(argv[9]);
- }
-
- if (argc >= 11) {
- n_pl = parse_list(argv[10]);
- }
+ std::vector<int> n_pp = params.n_pp;
+ std::vector<int> n_tg = params.n_tg;
+ std::vector<int> n_pl = params.n_pl;
// init LLM
@@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- const std::vector<float> t_split(llama_max_devices(), 0.0f);
-
- model_params.n_gpu_layers = n_gpu_layers;
- model_params.tensor_split = t_split.data();
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}
- llama_context_params ctx_params = llama_context_default_params();
-
- ctx_params.seed = 1234;
- ctx_params.n_ctx = n_kv_max;
- ctx_params.n_batch = n_batch;
- ctx_params.n_ubatch = n_ubatch;
- ctx_params.flash_attn = flash_attn;
-
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}
+ const int32_t n_kv_max = llama_n_ctx(ctx);
+
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
@@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
- LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+ LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
diff --git a/examples/batched/README.md b/examples/batched/README.md
index 5d7303317..ed204c308 100644
--- a/examples/batched/README.md
+++ b/examples/batched/README.md
@@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
-./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
+./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...
diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp
index be30d20bf..62d9b144d 100644
--- a/examples/batched/batched.cpp
+++ b/examples/batched/batched.cpp
@@ -7,48 +7,31 @@
#include <string>
#include <vector>
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
- return 1 ;
+ params.prompt = "Hello my name is";
+ params.n_predict = 32;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
+
// number of parallel batches
- int n_parallel = 1;
+ int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
- int n_len = 32;
-
- // number of layers to offload to the GPU
- int n_gpu_layers = 0;
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- params.prompt = argv[2];
- }
-
- if (argc >= 4) {
- n_parallel = std::atoi(argv[3]);
- }
-
- if (argc >= 5) {
- n_len = std::atoi(argv[4]);
- }
-
- if (argc >= 6) {
- n_gpu_layers = std::atoi(argv[5]);
- }
-
- if (params.prompt.empty()) {
- params.prompt = "Hello my name is";
- }
-
- process_escapes(params.prompt);
+ int n_predict = 32;
// init LLM
@@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = n_gpu_layers;
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
- const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
+ const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
- ctx_params.n_batch = std::max(n_len, n_parallel);
- ctx_params.n_seq_max = n_parallel;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@@ -93,9 +70,9 @@ int main(int argc, char ** argv) {
return 1;
}
- const int n_ctx = llama_n_ctx(ctx);
+ const int n_ctx = llama_n_ctx(ctx);
- LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
+ LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
@@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
- while (n_cur <= n_len) {
+ while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);
@@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
+ if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {
diff --git a/examples/beam-search/CMakeLists.txt b/examples/beam-search/CMakeLists.txt
deleted file mode 100644
index f0e37468b..000000000
--- a/examples/beam-search/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-set(TARGET beam-search)
-add_executable(${TARGET} beam-search.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp
deleted file mode 100644
index 3d34378a5..000000000
--- a/examples/beam-search/beam-search.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-#include "common.h"
-#include "llama.h"
-
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-// Used for debugging to print out beam tokens.
-struct ostream_beam_view {
- llama_context * ctx;
- llama_beam_view beam_view;
-};
-
-static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
- os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
- for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
- os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
- }
- return os << ')';
-}
-
-// Put here anything you want back in beam_search_callback().
-struct beam_search_callback_data {
- llama_context * ctx;
- std::vector<llama_token> response;
-};
-
-// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
-// For example, eob can be flagged due to maximum token length, stop words, etc.
-static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
- return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
-}
-
-// Function matching type llama_beam_search_callback_fn_t.
-// Custom callback example is called each time the beams lengths increase:
-// * Show progress by printing ',' following by number of convergent beam tokens if any.
-// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
-// This is also called when the stop condition is met.
-// Collect tokens into std::vector response which is pointed to by callback_data.
-static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
- auto& callback_data = *static_cast(callback_data_ptr);
- // Mark beams as EOS as needed.
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- llama_beam_view& beam_view = beams_state.beam_views[i];
- if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
- beam_view.eob = true;
- }
- }
- printf(","); // Show progress
- if (const size_t n = beams_state.common_prefix_length) {
- callback_data.response.resize(callback_data.response.size() + n);
- assert(0u < beams_state.n_beams);
- const llama_token * tokens = beams_state.beam_views[0].tokens;
- std::copy(tokens, tokens + n, callback_data.response.end() - n);
- printf("%zu", n);
- }
- fflush(stdout);
-#if 1 // DEBUG: print current beams for this iteration
- std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
- for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
- std::cout << "beams["< 3 )
- {
- params.prompt = argv[3];
- }
-
- if ( params.prompt.empty() )
- {
- params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
- }
-
- //---------------------------------
- // Init LLM :
- //---------------------------------
-
- llama_backend_init();
- llama_numa_init(params.numa);
-
- llama_model * model;
- llama_context * ctx;
-
- std::tie(model, ctx) = llama_init_from_gpt_params( params );
-
- if ( model == NULL )
- {
- fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
- return 1;
- }
-
- //---------------------------------
- // Tokenize the prompt :
- //---------------------------------
-
- std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
-
- const size_t max_context_size = llama_n_ctx( ctx );
- const size_t max_tokens_list_size = max_context_size - 4 ;
-
- if (tokens_list.size() > max_tokens_list_size)
- {
- fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
- __func__ , tokens_list.size() , max_tokens_list_size );
- return 1;
- }
-
- fprintf( stderr, "\n\n" );
-
- // Print the tokens from the prompt :
-
- for( auto id : tokens_list )
- {
- std::cout << llama_token_to_piece(ctx, id);
- }
- std::cout << std::flush;
-
- int n_past = 0;
-
- if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
- {
- fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
- return 1;
- }
- n_past += tokens_list.size();
-
- beam_search_callback_data callback_data{ctx, {}};
- size_t const beam_width = static_cast<size_t>(params.n_beams);
- int const n_predict = 256;
- llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
-
- std::cout << "\n\n";
- for (llama_token const token_id : callback_data.response) {
- std::cout << llama_token_to_piece(ctx,token_id);
- }
- std::cout << std::endl;
-
- llama_free( ctx );
- llama_free_model( model );
-
- llama_backend_free();
-
- return 0;
-}
diff --git a/convert.py b/examples/convert-legacy-llama.py
similarity index 82%
rename from convert.py
rename to examples/convert-legacy-llama.py
index da1247957..fd8401015 100755
--- a/convert.py
+++ b/examples/convert-legacy-llama.py
@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
import numpy as np
-from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+ # use .parent.parent since we are in "examples" directory
+ sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
+
import gguf
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
@@ -380,306 +382,6 @@ class Metadata:
return metadata
-#
-# vocab
-#
-
-
-@runtime_checkable
-class BaseVocab(Protocol):
- tokenizer_model: ClassVar[str]
- name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
- tokenizer_model = "no_vocab"
- name = "no_vocab"
-
- def __repr__(self) -> str:
- return ""
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
- vocab_size: int
- added_tokens_dict: dict[str, int]
- added_tokens_list: list[str]
- fname_tokenizer: Path
-
- def __init__(self, base_path: Path): ...
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
- tokenizer_model = "gpt2"
- name = "bpe"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
-
- if (fname_tokenizer := base_path / 'vocab.json').exists():
- # "slow" tokenizer
- with open(fname_tokenizer, encoding="utf-8") as f:
- self.vocab = json.load(f)
-
- try:
- # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- else:
- # "fast" tokenizer
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding="utf-8") as f:
- tokenizer_json = json.load(f)
-
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
- tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'ByteLevel'
- ):
- raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
- self.vocab = tokenizer_model["vocab"]
-
- if (added := tokenizer_json.get('added_tokens')) is not None:
- # Added tokens here can be duplicates of the main vocabulary.
- added_tokens = {item['content']: item['id']
- for item in added
- if item['content'] not in self.vocab}
-
- vocab_size = len(self.vocab)
- expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
- actual_ids = sorted(added_tokens.values())
- if expected_ids != actual_ids:
- expected_end_id = vocab_size + len(actual_ids) - 1
- raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
- f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
- items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [text for (text, idx) in items]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
- for i, _ in enumerate(self.vocab):
- yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.bpe_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
-class SentencePieceVocab(Vocab):
- tokenizer_model = "llama"
- name = "spm"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
- if (fname_tokenizer := base_path / 'tokenizer.model').exists():
- # normal location
- try:
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
- # not found in alternate location either
- raise FileNotFoundError('Cannot find tokenizer.model')
-
- self.sentencepiece_tokenizer = SentencePieceProcessor()
- self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
- vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
- new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
- expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
- actual_new_ids = sorted(new_tokens.keys())
-
- if expected_new_ids != actual_new_ids:
- raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
- # Token pieces that were added to the base vocabulary.
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- tokenizer = self.sentencepiece_tokenizer
- for i in range(tokenizer.vocab_size()):
- piece = tokenizer.IdToPiece(i)
- text = piece.encode("utf-8")
- score: float = tokenizer.GetScore(i)
-
- toktype = gguf.TokenType.NORMAL
- if tokenizer.IsUnknown(i):
- toktype = gguf.TokenType.UNKNOWN
- if tokenizer.IsControl(i):
- toktype = gguf.TokenType.CONTROL
-
- # NOTE: I think added_tokens are user defined.
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
- # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
- if tokenizer.IsUnused(i):
- toktype = gguf.TokenType.UNUSED
- if tokenizer.IsByte(i):
- toktype = gguf.TokenType.BYTE
-
- yield text, score, toktype
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.sentencepiece_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
-class LlamaHfVocab(Vocab):
- tokenizer_model = "llama"
- name = "hfft"
-
- def __init__(self, base_path: Path):
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding='utf-8') as f:
- tokenizer_json = json.load(f)
-
- # pre-check so we know if we need transformers
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- is_llama3 = (
- tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
- and not tokenizer_model.get('byte_fallback', True)
- )
- if is_llama3:
- raise TypeError('Llama 3 must be converted with BpeVocab')
-
- if not is_llama3 and (
- tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'Sequence'
- ):
- raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
- try:
- from transformers import AutoTokenizer
- except ImportError as e:
- raise ImportError(
- "To use LlamaHfVocab, please install the `transformers` package. "
- "You can install it with `pip install transformers`."
- ) from e
-
- # Allow the tokenizer to default to slow or fast versions.
- # Explicitly set tokenizer to use local paths.
- self.tokenizer = AutoTokenizer.from_pretrained(
- base_path,
- cache_dir=base_path,
- local_files_only=True,
- )
- assert self.tokenizer.is_fast # assume tokenizer.json is used
-
- # Initialize lists and dictionaries for added tokens
- self.added_tokens_list = []
- self.added_tokens_dict = dict()
- self.added_tokens_ids = set()
-
- # Process added tokens
- for tok, tokidx in sorted(
- self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
- ):
- # Only consider added tokens that are not in the base vocabulary
- if tokidx >= self.tokenizer.vocab_size:
- self.added_tokens_list.append(tok)
- self.added_tokens_dict[tok] = tokidx
- self.added_tokens_ids.add(tokidx)
-
- # Store special tokens and their IDs
- self.specials = {
- tok: self.tokenizer.get_vocab()[tok]
- for tok in self.tokenizer.all_special_tokens
- }
- self.special_ids = set(self.tokenizer.all_special_ids)
-
- # Set vocabulary sizes
- self.vocab_size_base = self.tokenizer.vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
- self.fname_tokenizer = fname_tokenizer
-
- def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {
- id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
- }
-
- for token_id in range(self.vocab_size_base):
- # Skip processing added tokens here
- if token_id in self.added_tokens_ids:
- continue
-
- # Convert token text to bytes
- token_text = reverse_vocab[token_id].encode("utf-8")
-
- # Yield token text, score, and type
- yield token_text, self.get_token_score(token_id), self.get_token_type(
- token_id, token_text, self.special_ids # Reuse already stored special IDs
- )
-
- def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
- # Special case for byte tokens
- if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
- return gguf.TokenType.BYTE
-
- # Determine token type based on whether it's a special token
- return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
- def get_token_score(self, token_id: int) -> float:
- # Placeholder for actual logic to determine the token's score
- # This needs to be implemented based on specific requirements
- return -1000.0 # Default score
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- if text in self.specials:
- toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
- score = self.get_token_score(self.specials[text])
- else:
- toktype = gguf.TokenType.USER_DEFINED
- score = -1000.0
-
- yield text.encode("utf-8"), score, toktype
-
- def has_newline_token(self):
- return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.hf_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f""
-
-
#
# data loading
# TODO: reuse (probably move to gguf.py?)
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 746c3fbef..8ca9f8915 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
params.samples_start_after_nl = false;
params.use_adam = true;
- params.use_flash = true;
+ params.use_flash = false;
params.use_scratch = true;
// only adam
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 0c921ed69..244751e00 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
@@ -107,7 +105,7 @@ int main(int argc, char ** argv) {
// print system information
{
fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
// split the prompt into lines
diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp
index e670d3769..64cd338c2 100644
--- a/examples/eval-callback/eval-callback.cpp
+++ b/examples/eval-callback/eval-callback.cpp
@@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
}
int main(int argc, char ** argv) {
-
callback_data cb_data;
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
llama_backend_init();
llama_numa_init(params.numa);
@@ -176,7 +174,7 @@ int main(int argc, char ** argv) {
// print system information
{
fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = run(ctx, params);
diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp
index 22743b1bf..22425730f 100644
--- a/examples/finetune/finetune.cpp
+++ b/examples/finetune/finetune.cpp
@@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silcence warnings
const int rope_mode = 0;
- return ggml_rope_custom(ctx,
- t, KQ_pos, n_rot, rope_mode, n_ctx, 0,
+ return ggml_rope_ext(ctx,
+ t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};
@@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
struct ggml_tensor * t16;
if (enable_flash_attn) {
- t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
+ GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
+ //t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
} else {
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);
diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh
index 7ca6fa7f2..3bc0fa471 100755
--- a/examples/gguf-split/tests.sh
+++ b/examples/gguf-split/tests.sh
@@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
-#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
+#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
-$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
+$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo
diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp
index 52fd719b3..213515791 100644
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
+
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp
index 82b19fc4f..e050c09d2 100644
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@@ -533,7 +533,6 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
int main(int argc, char ** argv) {
-
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
@@ -581,7 +580,9 @@ int main(int argc, char ** argv) {
gpt_params params;
params.n_batch = 512;
- if (!gpt_params_parse(args.size(), args.data(), params)) {
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -597,9 +598,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
@@ -667,7 +665,7 @@ int main(int argc, char ** argv) {
// print system information
{
fprintf(stderr, "\n");
- fprintf(stderr, "%s\n", get_system_info(params).c_str());
+ fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp
index afac145f6..0e4ec79c6 100644
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@@ -50,9 +50,9 @@ static void write_logfile(
return;
}
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
@@ -70,7 +70,7 @@ static void write_logfile(
fprintf(logfile, "binary: infill\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
@@ -78,8 +78,8 @@ static void write_logfile(
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
@@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
g_params = ¶ms;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
- if (params.instruct) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for instruct mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (params.chatml) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for chatml mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.antiprompt.empty()) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
return 0;
}
- if (params.random_prompt) {
- printf("\n************\n");
- printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
- if (!params.path_prompt_cache.empty()) {
- printf("\n************\n");
- printf("%s: infill does not support prompt caching\n", __func__);
- printf("************\n\n");
-
- return 0;
- }
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
- llama_context * ctx_guidance = NULL;
+
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
- if (sparams.cfg_scale > 1.f) {
- struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
- ctx_guidance = llama_new_context_with_model(model, lparams);
- }
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
@@ -236,7 +198,7 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
}
const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1);
@@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
- // Tokenize negative prompt
- std::vector<llama_token> guidance_inp;
- int guidance_offset = 0;
- int original_prompt_len = 0;
- if (ctx_guidance) {
- LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
-
- guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
- LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
-
- std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
- LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
-
- original_prompt_len = original_inp.size();
- guidance_offset = (int)guidance_inp.size() - original_prompt_len;
- LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
- LOG("guidance_offset: %s", log_tostr(guidance_offset));
- }
-
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
@@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
- if (ctx_guidance) {
- LOG_TEE("\n");
- LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
- LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
- for (int i = 0; i < (int) guidance_inp.size(); i++) {
- LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
- }
- }
-
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
@@ -395,12 +329,11 @@ int main(int argc, char ** argv) {
is_interacting = params.interactive_first;
}
- bool input_echo = true;
+ bool input_echo = true;
- int n_past = 0;
- int n_remain = params.n_predict;
- int n_consumed = 0;
- int n_past_guidance = 0;
+ int n_past = 0;
+ int n_remain = params.n_predict;
+ int n_consumed = 0;
std::vector<llama_token> input_tokens; g_input_tokens = &input_tokens;
std::vector<llama_token> output_tokens; g_output_tokens = &output_tokens;
@@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
console::set_display(console::prompt);
std::vector<llama_token> embd;
- std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
- if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) {
+ if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
- if (ctx_guidance) {
- n_past_guidance -= n_discard;
- }
-
- LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
+ LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
@@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
-
- if (ctx_guidance) {
- int input_size = 0;
- llama_token * input_buf = NULL;
-
- if (n_past_guidance < (int) guidance_inp.size()) {
- // Guidance context should have the same data with these modifications:
- //
- // * Replace the initial prompt
- // * Shift everything by guidance_offset
- embd_guidance = guidance_inp;
- if (embd.begin() + original_prompt_len < embd.end()) {
- embd_guidance.insert(
- embd_guidance.end(),
- embd.begin() + original_prompt_len,
- embd.end()
- );
- }
-
- input_buf = embd_guidance.data();
- input_size = embd_guidance.size();
-
- LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
- } else {
- input_buf = embd.data();
- input_size = embd.size();
- }
-
- for (int i = 0; i < input_size; i += params.n_batch) {
- int n_eval = std::min(input_size - i, params.n_batch);
- if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
- LOG_TEE("%s : failed to eval\n", __func__);
- return 1;
- }
-
- n_past_guidance += n_eval;
- }
- }
-
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
@@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
}
embd.clear();
- embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-
- const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
+ const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
@@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
-
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
@@ -621,8 +507,8 @@ int main(int argc, char ** argv) {
if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
- process_escapes(params.input_prefix);
- process_escapes(params.input_suffix);
+ string_process_escapes(params.input_prefix);
+ string_process_escapes(params.input_suffix);
}
suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
@@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd.clear();
- embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
@@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
- if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);
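Editor's note: a recurring pattern in this patch is that each example now calls the new `gpt_params_print_usage()` helper itself after a failed `gpt_params_parse()`. The following is a minimal, hedged sketch of that idiom — not part of the patch — assuming a program built inside the llama.cpp tree and linked against the repo's `common` library.

```cpp
// Sketch of the argument-parsing idiom applied across the examples in this patch.
// Assumes the llama.cpp "common" helpers (gpt_params, gpt_params_parse,
// gpt_params_print_usage) are available; everything else is intentionally minimal.
#include "common.h"

int main(int argc, char ** argv) {
    gpt_params params;

    if (!gpt_params_parse(argc, argv, params)) {
        // the example prints its own usage text on a parse failure
        gpt_params_print_usage(argc, argv, params);
        return 1;
    }

    // ... example-specific work would follow here ...
    return 0;
}
```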
diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md
index 857840564..fd95b35f4 100644
--- a/examples/llama-bench/README.md
+++ b/examples/llama-bench/README.md
@@ -162,7 +162,7 @@ $ ./llama-bench -o csv
```
```csv
-build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
+build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
@@ -179,7 +179,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
- "opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -210,7 +209,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
- "opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
- opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
@@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
-INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
+INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 8b965e199..5c31548a6 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
return str.str();
}
-template<class T>
-static std::vector<T> split(const std::string & str, char delim) {
- std::vector<T> values;
- std::istringstream str_stream(str);
- std::string token;
- while (std::getline(str_stream, token, delim)) {
- T value;
- std::istringstream token_stream(token);
- token_stream >> value;
- values.push_back(value);
- }
- return values;
-}
-
template<class T, class F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
@@ -140,10 +126,11 @@ static std::string get_gpu_info() {
}
// command line params
-enum output_formats {CSV, JSON, MARKDOWN, SQL};
+enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
+ case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
@@ -152,6 +139,23 @@ static const char * output_format_str(output_formats format) {
}
}
+static bool output_format_from_str(const std::string & s, output_formats & format) {
+ if (s == "none") {
+ format = NONE;
+ } else if (s == "csv") {
+ format = CSV;
+ } else if (s == "json") {
+ format = JSON;
+ } else if (s == "md") {
+ format = MARKDOWN;
+ } else if (s == "sql") {
+ format = SQL;
+ } else {
+ return false;
+ }
+ return true;
+}
+
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
@@ -178,6 +182,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
+ std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
@@ -189,30 +194,33 @@ struct cmd_params {
int reps;
bool verbose;
output_formats output_format;
+ output_formats output_format_stderr;
};
static const cmd_params cmd_params_defaults = {
- /* model */ {"models/7B/ggml-model-q4_0.gguf"},
- /* n_prompt */ {512},
- /* n_gen */ {128},
- /* n_pg */ {{512, 128}},
- /* n_batch */ {2048},
- /* n_ubatch */ {512},
- /* type_k */ {GGML_TYPE_F16},
- /* type_v */ {GGML_TYPE_F16},
- /* n_threads */ {get_math_cpu_count()},
- /* n_gpu_layers */ {99},
- /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
- /* main_gpu */ {0},
- /* no_kv_offload */ {false},
- /* flash_attn */ {false},
- /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
- /* use_mmap */ {true},
- /* embeddings */ {false},
- /* numa */ GGML_NUMA_STRATEGY_DISABLED,
- /* reps */ 5,
- /* verbose */ false,
- /* output_format */ MARKDOWN
+ /* model */ {"models/7B/ggml-model-q4_0.gguf"},
+ /* n_prompt */ {512},
+ /* n_gen */ {128},
+ /* n_pg */ {},
+ /* n_batch */ {2048},
+ /* n_ubatch */ {512},
+ /* type_k */ {GGML_TYPE_F16},
+ /* type_v */ {GGML_TYPE_F16},
+ /* n_threads */ {cpu_get_num_math()},
+ /* n_gpu_layers */ {99},
+ /* rpc_servers */ {""},
+ /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
+ /* main_gpu */ {0},
+ /* no_kv_offload */ {false},
+ /* flash_attn */ {false},
+ /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
+ /* use_mmap */ {true},
+ /* embeddings */ {false},
+ /* numa */ GGML_NUMA_STRATEGY_DISABLED,
+ /* reps */ 5,
+ /* verbose */ false,
+ /* output_format */ MARKDOWN,
+ /* output_format_stderr */ NONE,
};
static void print_usage(int /* argc */, char ** argv) {
@@ -230,6 +238,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -240,6 +249,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split (default: 0)\n");
printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
+ printf(" -oe, --output-err (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@@ -281,6 +291,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
+ params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
@@ -297,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], ',');
+ auto p = string_split(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
@@ -329,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@@ -359,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@@ -375,21 +386,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+ } else if (arg == "-rpc" || arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.rpc_servers.push_back(argv[i]);
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
@@ -411,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- params.main_gpu = split(argv[i], split_delim);
+ params.main_gpu = string_split(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
@@ -435,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
- auto p = split(argv[i], split_delim);
+ auto p = string_split(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
- for (auto ts : split(argv[i], split_delim)) {
+ for (auto ts : string_split(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@@ -484,18 +501,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
- if (argv[i] == std::string("csv")) {
- params.output_format = CSV;
- } else if (argv[i] == std::string("json")) {
- params.output_format = JSON;
- } else if (argv[i] == std::string("md")) {
- params.output_format = MARKDOWN;
- } else if (argv[i] == std::string("sql")) {
- params.output_format = SQL;
- } else {
+ invalid_param = !output_format_from_str(argv[i], params.output_format);
+ } else if (arg == "-oe" || arg == "--output-err") {
+ if (++i >= argc) {
invalid_param = true;
break;
}
+ invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
@@ -519,6 +531,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+ if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -541,6 +554,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
+ std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
@@ -553,6 +567,9 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
+ if (!rpc_servers.empty()) {
+ mparams.rpc_servers = rpc_servers.c_str();
+ }
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@@ -564,6 +581,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
+ rpc_servers == other.rpc_servers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
@@ -592,6 +610,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
+ for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
@@ -618,6 +637,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -643,6 +663,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -668,6 +689,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -687,11 +709,11 @@ struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
- static const bool opencl;
static const bool vulkan;
static const bool kompute;
static const bool metal;
static const bool sycl;
+ static const bool rpc;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
@@ -775,9 +797,6 @@ struct test {
if (cuda) {
return GGML_CUDA_NAME;
}
- if (opencl) {
- return "OpenCL";
- }
if (vulkan) {
return "Vulkan";
}
@@ -790,6 +809,9 @@ struct test {
if (sycl) {
return GGML_SYCL_NAME;
}
+ if (rpc) {
+ return "RPC";
+ }
if (gpu_blas) {
return "GPU BLAS";
}
@@ -803,7 +825,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
- "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+ "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
@@ -829,7 +851,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
- if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
+ if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
@@ -858,8 +880,8 @@ struct test {
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
- std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
- std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+ std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
+ std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
@@ -887,13 +909,13 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
-const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
+const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
@@ -1254,6 +1276,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data;
}
+static std::unique_ptr<printer> create_printer(output_formats format) {
+ switch (format) {
+ case NONE:
+ return nullptr;
+ case CSV:
+ return std::unique_ptr<printer>(new csv_printer());
+ case JSON:
+ return std::unique_ptr<printer>(new json_printer());
+ case MARKDOWN:
+ return std::unique_ptr<printer>(new markdown_printer());
+ case SQL:
+ return std::unique_ptr<printer>(new sql_printer());
+ }
+ GGML_ASSERT(false);
+}
+
int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");
@@ -1280,26 +1318,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// initialize printer
- std::unique_ptr<printer> p;
- switch (params.output_format) {
- case CSV:
- p.reset(new csv_printer());
- break;
- case JSON:
- p.reset(new json_printer());
- break;
- case MARKDOWN:
- p.reset(new markdown_printer());
- break;
- case SQL:
- p.reset(new sql_printer());
- break;
- default:
- assert(false);
- exit(1);
+ std::unique_ptr<printer> p = create_printer(params.output_format);
+ std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
+
+ if (p) {
+ p->fout = stdout;
+ p->print_header(params);
+ }
+
+ if (p_err) {
+ p_err->fout = stderr;
+ p_err->print_header(params);
}
- p->fout = stdout;
- p->print_header(params);
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
@@ -1357,7 +1387,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
- p->print_test(t);
+ if (p) {
+ p->print_test(t);
+ fflush(p->fout);
+ }
+
+ if (p_err) {
+ p_err->print_test(t);
+ fflush(p_err->fout);
+ }
llama_print_timings(ctx);
@@ -1366,7 +1404,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel);
- p->print_footer();
+ if (p) {
+ p->print_footer();
+ }
+
+ if (p_err) {
+ p_err->print_footer();
+ }
llama_backend_free();
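Editor's note: to make the llama-bench change above easier to follow, here is a small, self-contained sketch (not from the patch) of the pattern it introduces — an optional second printer bound to stderr via the new `-oe`/`--output-err` option, with a `NONE` format that simply yields no printer. The type names below are simplified stand-ins, not the actual llama-bench printers.

```cpp
// Standalone illustration of the dual stdout/stderr printer pattern.
#include <cstdio>
#include <memory>
#include <string>

enum output_formats { NONE, CSV, MARKDOWN };

struct printer {
    FILE * fout = nullptr;
    virtual ~printer() = default;
    virtual void print_line(const std::string & s) = 0;
};

struct csv_printer : public printer {
    void print_line(const std::string & s) override { fprintf(fout, "\"%s\"\n", s.c_str()); }
};

struct markdown_printer : public printer {
    void print_line(const std::string & s) override { fprintf(fout, "| %s |\n", s.c_str()); }
};

static std::unique_ptr<printer> create_printer(output_formats format) {
    switch (format) {
        case NONE:     return nullptr; // no output on this stream
        case CSV:      return std::unique_ptr<printer>(new csv_printer());
        case MARKDOWN: return std::unique_ptr<printer>(new markdown_printer());
    }
    return nullptr;
}

int main() {
    // e.g. "-o md -oe csv": human-readable table on stdout, machine-readable CSV on stderr
    std::unique_ptr<printer> p     = create_printer(MARKDOWN);
    std::unique_ptr<printer> p_err = create_printer(CSV);

    if (p)     { p->fout = stdout; }
    if (p_err) { p_err->fout = stderr; }

    const std::string result = "llama 7B Q4_0: 132 t/s";
    if (p)     { p->print_line(result);     fflush(p->fout); }
    if (p_err) { p_err->print_line(result); fflush(p_err->fout); }

    return 0;
}
```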
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
index d42140efe..8d1b37195 100644
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -7,8 +7,6 @@ android {
namespace = "com.example.llama"
compileSdk = 34
- ndkVersion = "26.1.10909125"
-
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@@ -20,17 +18,6 @@ android {
vectorDrawables {
useSupportLibrary = true
}
- ndk {
- // Add NDK properties if wanted, e.g.
- // abiFilters += listOf("arm64-v8a")
- }
- externalNativeBuild {
- cmake {
- arguments += "-DCMAKE_BUILD_TYPE=Release"
- cppFlags += listOf()
- arguments += listOf()
- }
- }
}
buildTypes {
@@ -55,17 +42,6 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
- packaging {
- resources {
- excludes += "/META-INF/{AL2.0,LGPL2.1}"
- }
- }
- externalNativeBuild {
- cmake {
- path = file("src/main/cpp/CMakeLists.txt")
- version = "3.22.1"
- }
- }
}
dependencies {
@@ -78,6 +54,7 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
+ implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
index be95e2221..45ac29938 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,5 +1,6 @@
package com.example.llama
+import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
-class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
- llm.unload()
+ llamaAndroid.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
messages += ""
viewModelScope.launch {
- llm.send(text)
+ llamaAndroid.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
val start = System.nanoTime()
- val warmupResult = llm.bench(pp, tg, pl, nr)
+ val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
return@launch
}
- messages += llm.bench(512, 128, 1, 3)
+ messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
fun load(pathToModel: String) {
viewModelScope.launch {
try {
- llm.load(pathToModel)
+ llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts
index 50ebc8211..acd1ada7d 100644
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -2,4 +2,5 @@
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+ id("com.android.library") version "8.2.0" apply false
}
diff --git a/examples/llama.android/llama/.gitignore b/examples/llama.android/llama/.gitignore
new file mode 100644
index 000000000..796b96d1c
--- /dev/null
+++ b/examples/llama.android/llama/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt
similarity index 98%
rename from examples/llama.android/app/src/main/cpp/CMakeLists.txt
rename to examples/llama.android/llama/CMakeLists.txt
index 4536974a5..a5618cac0 100644
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
- llama-android.cpp)
+ llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts
new file mode 100644
index 000000000..0a3806172
--- /dev/null
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -0,0 +1,68 @@
+plugins {
+ id("com.android.library")
+ id("org.jetbrains.kotlin.android")
+}
+
+android {
+ namespace = "android.llama.cpp"
+ compileSdk = 34
+
+ defaultConfig {
+ minSdk = 33
+
+ testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+ consumerProguardFiles("consumer-rules.pro")
+ ndk {
+ // Add NDK properties if wanted, e.g.
+ // abiFilters += listOf("arm64-v8a")
+ }
+ externalNativeBuild {
+ cmake {
+ arguments += "-DCMAKE_BUILD_TYPE=Release"
+ cppFlags += listOf()
+ arguments += listOf()
+
+ cppFlags("")
+ }
+ }
+ }
+
+ buildTypes {
+ release {
+ isMinifyEnabled = false
+ proguardFiles(
+ getDefaultProguardFile("proguard-android-optimize.txt"),
+ "proguard-rules.pro"
+ )
+ }
+ }
+ externalNativeBuild {
+ cmake {
+ path("src/main/cpp/CMakeLists.txt")
+ version = "3.22.1"
+ }
+ }
+ compileOptions {
+ sourceCompatibility = JavaVersion.VERSION_1_8
+ targetCompatibility = JavaVersion.VERSION_1_8
+ }
+ kotlinOptions {
+ jvmTarget = "1.8"
+ }
+
+ packaging {
+ resources {
+ excludes += "/META-INF/{AL2.0,LGPL2.1}"
+ }
+ }
+}
+
+dependencies {
+
+ implementation("androidx.core:core-ktx:1.12.0")
+ implementation("androidx.appcompat:appcompat:1.6.1")
+ implementation("com.google.android.material:material:1.11.0")
+ testImplementation("junit:junit:4.13.2")
+ androidTestImplementation("androidx.test.ext:junit:1.1.5")
+ androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+}
diff --git a/examples/llama.android/llama/consumer-rules.pro b/examples/llama.android/llama/consumer-rules.pro
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/llama.android/llama/proguard-rules.pro b/examples/llama.android/llama/proguard-rules.pro
new file mode 100644
index 000000000..f1b424510
--- /dev/null
+++ b/examples/llama.android/llama/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+# http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+# public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
new file mode 100644
index 000000000..05d6ab5d2
--- /dev/null
+++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
@@ -0,0 +1,24 @@
+package android.llama.cpp
+
+import androidx.test.platform.app.InstrumentationRegistry
+import androidx.test.ext.junit.runners.AndroidJUnit4
+
+import org.junit.Test
+import org.junit.runner.RunWith
+
+import org.junit.Assert.*
+
+/**
+ * Instrumented test, which will execute on an Android device.
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+@RunWith(AndroidJUnit4::class)
+class ExampleInstrumentedTest {
+ @Test
+ fun useAppContext() {
+ // Context of the app under test.
+ val appContext = InstrumentationRegistry.getInstrumentation().targetContext
+ assertEquals("android.llama.cpp.test", appContext.packageName)
+ }
+}
diff --git a/examples/llama.android/llama/src/main/AndroidManifest.xml b/examples/llama.android/llama/src/main/AndroidManifest.xml
new file mode 100644
index 000000000..8bdb7e14b
--- /dev/null
+++ b/examples/llama.android/llama/src/main/AndroidManifest.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>
diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
new file mode 100644
index 000000000..42ebaad49
--- /dev/null
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+ llama
+ GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+ GIT_TAG master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+ # List C/C++ source files with relative paths to this CMakeLists.txt.
+ llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+ # List libraries link to the target library
+ llama
+ common
+ android
+ log)
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
similarity index 92%
rename from examples/llama.android/app/src/main/cpp/llama-android.cpp
rename to examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 4af9de303..874158ef0 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
similarity index 97%
rename from examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
rename to examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
index d86afee37..6c63e54e0 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -1,4 +1,4 @@
-package com.example.llama
+package android.llama.cpp
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
-class Llm {
+class LLamaAndroid {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class Llm {
}
// Enforce only one instance of Llm.
- private val _instance: Llm = Llm()
+ private val _instance: LLamaAndroid = LLamaAndroid()
- fun instance(): Llm = _instance
+ fun instance(): LLamaAndroid = _instance
}
}
diff --git a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
new file mode 100644
index 000000000..cbbb974d3
--- /dev/null
+++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
@@ -0,0 +1,17 @@
+package android.llama.cpp
+
+import org.junit.Test
+
+import org.junit.Assert.*
+
+/**
+ * Example local unit test, which will execute on the development machine (host).
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+class ExampleUnitTest {
+ @Test
+ fun addition_isCorrect() {
+ assertEquals(4, 2 + 2)
+ }
+}
diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts
index 2ba32c4fa..c7c1a034a 100644
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid"
include(":app")
+include(":llama")
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 413e433dd..74f021dec 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
--projector-type ldpv2
```
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 4fb0cf381..8d1ae5270 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
6) Then convert the model to gguf format:
```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava-cli using the 1.6 model version:
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 45bdad689..ca3631384 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp
index a6d67e5d7..8c7dd2ae3 100644
--- a/examples/llava/llava-cli.cpp
+++ b/examples/llava/llava-cli.cpp
@@ -112,9 +112,12 @@ struct llava_context {
struct llama_model * model = NULL;
};
-static void show_additional_info(int /*argc*/, char ** argv) {
- LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
- LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\n example usage:\n");
+ LOG_TEE("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+ LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
- show_additional_info(argc, argv);
+ print_usage(argc, argv, params);
return 1;
}
@@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
- gpt_print_usage(argc, argv, params);
- show_additional_info(argc, argv);
+ print_usage(argc, argv, {});
return 1;
}
auto model = llava_init(&params);
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index f80f727a7..17cb4d5e5 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -1,3 +1,3 @@
--r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
pillow~=10.2.0
torch~=2.1.1
diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp
index 9c3540b20..fb20ad93f 100644
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@@ -37,7 +37,8 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -174,7 +175,7 @@ int main(int argc, char ** argv) {
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
}
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp
index 1c230c966..d713f6f21 100644
--- a/examples/lookup/lookup-create.cpp
+++ b/examples/lookup/lookup-create.cpp
@@ -14,8 +14,10 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);
diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp
index 87ecc0a4f..0b171c872 100644
--- a/examples/lookup/lookup-stats.cpp
+++ b/examples/lookup/lookup-stats.cpp
@@ -16,6 +16,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp
index eebbd00a5..80ecd925d 100644
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@@ -15,6 +15,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -121,7 +122,7 @@ int main(int argc, char ** argv){
// debug
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
}
// print current draft sequence
diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md
index edf20d8db..a88e92f23 100644
--- a/examples/main-cmake-pkg/README.md
+++ b/examples/main-cmake-pkg/README.md
@@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations
-When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
+When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory
-In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
-
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
@@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
```cmd
cd ..\examples\main-cmake-pkg
-cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
+cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```
diff --git a/examples/main/README.md b/examples/main/README.md
index ee930f4e7..4eaa68475 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
-./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
+./main -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
-main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
+main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
@@ -80,7 +80,6 @@ The `main` program provides several ways to interact with the LLaMA models using
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
-- `--random-prompt`: Start with a randomized prompt.
## Interaction
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 832b51ee0..b97b7b793 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -60,9 +60,9 @@ static void write_logfile(
return;
}
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
@@ -80,7 +80,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
@@ -88,8 +88,8 @@ static void write_logfile(
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
- dump_string_yaml_multiline(logfile, "output", output.c_str());
- dump_vector_int_yaml(logfile, "output_tokens", output_tokens);
+ yaml_dump_string_multiline(logfile, "output", output.c_str());
+ yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
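The `write_logfile` hunks above, and the matching ones in `perplexity.cpp` further down, only rename helpers (`string_get_sortable_timestamp`, `fs_create_directory_with_parents`, the `yaml_dump_*` family). A hedged sketch of how the renamed helpers fit together, with signatures assumed from their use in these hunks:
```cpp
#include "common.h"
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

// Minimal logfile writer using the renamed common.h helpers; a sketch only,
// mirroring the calls made by write_logfile in the hunks above.
static void write_minimal_logfile(const gpt_params & params, const llama_context * ctx,
                                  const std::string & output,
                                  const std::vector<llama_token> & output_tokens) {
    if (params.logdir.empty()) {
        return;
    }

    const std::string timestamp = string_get_sortable_timestamp();
    if (!fs_create_directory_with_parents(params.logdir)) {
        fprintf(stderr, "warning: failed to create logdir %s\n", params.logdir.c_str());
        return;
    }

    const std::string logfile_path = params.logdir + timestamp + ".yml";
    FILE * logfile = fopen(logfile_path.c_str(), "w");
    if (logfile == NULL) {
        return;
    }

    yaml_dump_string_multiline(logfile, "output", output.c_str());
    yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
    llama_dump_timing_info_yaml(logfile, ctx);

    fclose(logfile);
}
```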
@@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
+
llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
@@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
- if (params.random_prompt) {
- params.prompt = gpt_random_prompt(rng);
- }
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
@@ -219,7 +218,7 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_TEE("\n");
- LOG_TEE("%s\n", get_system_info(params).c_str());
+ LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
}
std::string path_session = params.path_prompt_cache;
@@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
- if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
+ if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
- if (params.chatml) {
- params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
- }
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else {
LOG("use session tokens\n");
@@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
- if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
+ if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
} else {
params.n_keep += add_bos; // always keep the BOS token
}
- // prefix & suffix for instruct mode
- const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
- const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
-
- LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
- LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
-
- // chatml prefix & suffix
- const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
- const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
-
- LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
- LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
-
- // in instruct mode, we inject a prefix and a suffix to each input by the user
- if (params.instruct) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("### Instruction:\n\n");
- }
- // similar for chatml mode
- else if (params.chatml) {
- params.interactive_first = true;
- params.antiprompt.emplace_back("<|im_start|>user\n");
- }
- else if (params.conversation) {
+ if (params.conversation) {
params.interactive_first = true;
}
@@ -474,12 +446,12 @@ int main(int argc, char ** argv) {
LOG_TEE("\n\n");
if (params.interactive) {
- const char *control_message;
+ const char * control_message;
if (params.multiline_input) {
- control_message = " - To return control to LLaMa, end your input with '\\'.\n"
+ control_message = " - To return control to the AI, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n";
} else {
- control_message = " - Press Return to return control to LLaMa.\n"
+ control_message = " - Press Return to return control to the AI.\n"
" - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n";
}
@@ -740,18 +712,26 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
- printf("%s", token_str.c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+ // Console/Stream Output
+ fprintf(stdout, "%s", token_str.c_str());
+
+ // Record Displayed Tokens To Log
+ // Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
+ // Incoming Requested Tokens
input_tokens.push_back(id);
} else {
+ // Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
+
+ fflush(stdout);
}
- fflush(stdout);
}
+
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
@@ -815,15 +795,13 @@ int main(int argc, char ** argv) {
is_interacting = true;
printf("\n");
- } else if (params.instruct || params.chatml) {
- is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
- if (params.conversation || params.instruct || params.chatml) {
+ if (params.conversation) {
printf("\n> ");
}
@@ -866,24 +844,12 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size();
- // instruct mode: insert instruction prefix
- if (params.instruct && !is_antiprompt) {
- LOG("inserting instruction prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
- }
- // chatml mode: insert user chat prefix
- if (params.chatml && !is_antiprompt) {
- LOG("inserting chatml prefix\n");
- n_consumed = embd_inp.size();
- embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
- }
if (params.escape) {
- process_escapes(buffer);
+ string_process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
- const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
+ const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@@ -892,17 +858,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
- // instruct mode: insert response suffix
- if (params.instruct) {
- LOG("inserting instruction suffix\n");
- embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
- }
- // chatml mode: insert assistant chat suffix
- if (params.chatml) {
- LOG("inserting chatml suffix\n");
- embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
- }
-
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
@@ -927,7 +882,7 @@ int main(int argc, char ** argv) {
}
// end of generation
- if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
+ if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n");
break;
}
diff --git a/examples/make-ggml.py b/examples/make-ggml.py
deleted file mode 100755
index c73485ebf..000000000
--- a/examples/make-ggml.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
-- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
-- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
-- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
-- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
-- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
-- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
-- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
-- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
-- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
-- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
-- Q2_K: smallest, extreme quality loss - not recommended
-- Q3_K: alias for Q3_K_M
-- Q3_K_S: very small, very high quality loss
-- Q3_K_M: very small, very high quality loss
-- Q3_K_L: small, substantial quality loss
-- Q4_K: alias for Q4_K_M
-- Q4_K_S: small, significant quality loss
-- Q4_K_M: medium, balanced quality - recommended
-- Q5_K: alias for Q5_K_M
-- Q5_K_S: large, low quality loss - recommended
-- Q5_K_M: large, very low quality loss - recommended
-- Q6_K: very large, extremely low quality loss
-- Q8_0: very large, extremely low quality loss - not recommended
-- F16: extremely large, virtually no quality loss - not recommended
-- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
- if not os.path.isdir(model):
- print(f"Model not found at {model}. Downloading...")
- try:
- if outname is None:
- outname = model.split('/')[-1]
- model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
- except Exception as e:
- raise Exception(f"Could not download the model: {e}")
-
- if outdir is None:
- outdir = f'../models/{outname}'
-
- if not os.path.isfile(f"{model}/config.json"):
- raise Exception(f"Could not find config.json in {model}")
-
- os.makedirs(outdir, exist_ok=True)
-
- print("Building llama.cpp")
- subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
- fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
- print(f"Making unquantised GGUF at {fp16}")
- if not os.path.isfile(fp16):
- if model_type != "llama":
- subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
- else:
- subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
- else:
- print(f"Unquantised GGML already exists at: {fp16}")
-
- print("Making quants")
- for type in quants:
- outfile = f"{outdir}/{outname}.gguf.{type}.bin"
- print(f"Making {type} : {outfile}")
- subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
- if not keep_fp16:
- os.remove(fp16)
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
- parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
- parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
- parser.add_argument('--outname', default=None, help='Output model(s) name')
- parser.add_argument('--outdir', default=None, help='Output directory')
- parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
- parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
- args = parser.parse_args()
-
- main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp
index 7c5595d6e..7faeaec97 100644
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@@ -100,7 +100,8 @@ int main(int argc, char ** argv) {
gpt_params params;
- if (gpt_params_parse(argc, argv, params) == false) {
+ if (!gpt_params_parse(argc, argv, params)) {
+ gpt_params_print_usage(argc, argv, params);
return 1;
}
@@ -210,7 +211,7 @@ int main(int argc, char ** argv) {
while (true) {
if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view);
- dump_kv_cache_view_seqs(kvc_view, 40);
+ llama_kv_cache_dump_view_seqs(kvc_view, 40);
}
llama_batch_clear(batch);
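Both `lookup.cpp` and `parallel.cpp` switch to the renamed `llama_kv_cache_dump_view_seqs` for their `--dump-kv-cache` debugging path. A rough sketch of that path, assuming the `llama_kv_cache_view_*` API from `llama.h` that these examples already use:
```cpp
#include "common.h"
#include "llama.h"

// Debug-only helper: refresh the KV-cache view and print the per-sequence
// occupancy, as the examples above do when --dump-kv-cache is set.
// A sketch; the view API signatures are assumed from llama.h.
static void debug_dump_kv_cache(llama_context * ctx, int n_seq_max) {
    llama_kv_cache_view kvc_view = llama_kv_cache_view_init(ctx, n_seq_max);

    llama_kv_cache_view_update(ctx, &kvc_view);
    llama_kv_cache_dump_view_seqs(kvc_view, 40); // 40 cells per printed row

    llama_kv_cache_view_free(&kvc_view);
}
```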
diff --git a/examples/passkey/README.md b/examples/passkey/README.md
index 4a22bb559..9e7a119ba 100644
--- a/examples/passkey/README.md
+++ b/examples/passkey/README.md
@@ -8,5 +8,5 @@ See the following PRs for more info:
### Usage
```bash
-make -j && ./passkey ./models/llama-7b-v2/ggml-model-f16.gguf 250
+make -j && ./passkey -m ./models/llama-7b-v2/ggml-model-f16.gguf --junk 250
```
diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp
index f2ef9ca10..d03215cd1 100644
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@@ -6,46 +6,32 @@
#include <string>
#include <vector>
+static void print_usage(int argc, char ** argv, const gpt_params & params) {
+ gpt_params_print_usage(argc, argv, params);
+
+ LOG_TEE("\nexample usage:\n");
+ LOG_TEE("\n %s -m model.gguf --junk 250 --pos 90 --keep 32 --grp-attn-n 2 [--seed 1234]\n", argv[0]);
+ LOG_TEE("\n");
+}
+
int main(int argc, char ** argv) {
gpt_params params;
- if (argc == 1 || argv[1][0] == '-') {
- printf("usage: %s MODEL_PATH N_JUNK N_GRP I_POS SEED\n" , argv[0]);
- return 1 ;
+ params.n_junk = 250;
+ params.n_keep = 32;
+ params.i_pos = -1;
+
+ if (!gpt_params_parse(argc, argv, params)) {
+ print_usage(argc, argv, params);
+ return 1;
}
- int seed = -1;
+ srand(params.seed == LLAMA_DEFAULT_SEED ? time(NULL) : params.seed);
- int n_junk = 250; // number of times to repeat the junk text
- int n_keep = 32; // number of tokens in the prompt prefix
- int n_grp = 1; // if more than 1 - perform LongLM SelfExtend
- int i_pos = -1; // position of the passkey in the junk text
-
- if (argc >= 2) {
- params.model = argv[1];
- }
-
- if (argc >= 3) {
- n_junk = std::stoi(argv[2]);
- }
-
- if (argc >= 4) {
- n_grp = std::stoi(argv[3]);
- }
-
- if (argc >= 5) {
- i_pos = std::stoi(argv[4]);
- }
-
- if (argc >= 6) {
- seed = std::stoi(argv[5]);
- }
-
- if (seed == -1) {
- seed = time(NULL);
- }
-
- srand(seed);
+ int n_junk = params.n_junk;
+ int n_keep = params.n_keep;
+ int n_grp = params.grp_attn_n;
+ int i_pos = params.i_pos;
if (i_pos == -1) {
i_pos = rand() % n_junk;
@@ -76,9 +62,7 @@ int main(int argc, char ** argv) {
// initialize the model
- llama_model_params model_params = llama_model_default_params();
-
- model_params.n_gpu_layers = 99; // offload all layers to the GPU
+ llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@@ -89,13 +73,9 @@ int main(int argc, char ** argv) {
// initialize the context
- llama_context_params ctx_params = llama_context_default_params();
+ llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
- ctx_params.seed = seed;
- ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
- ctx_params.n_batch = 512;
- ctx_params.n_threads = params.n_threads;
- ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+ ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
GGML_ASSERT(ctx_params.n_batch % n_grp == 0 && "n_batch must be divisible by n_grp");
@@ -135,7 +115,7 @@ int main(int argc, char ** argv) {
LOG_TEE("prompt tokens: %d\n", n_tokens_all);
//LOG_TEE("prompt: %s\n", params.prompt.c_str());
- llama_batch batch = llama_batch_init(512, 0, 1);
+ llama_batch batch = llama_batch_init(params.n_batch, 0, 1);
int n_past = 0;
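The rewritten `passkey.cpp` above drops its hand-rolled positional arguments and derives model and context parameters from the parsed `gpt_params` instead. A rough sketch of that initialization path, assuming the `llama_model_params_from_gpt_params` and `llama_context_params_from_gpt_params` helpers shown in the hunk:
```cpp
#include "common.h"
#include "llama.h"

// Sketch: build model/context parameters from gpt_params as the refactored
// passkey example does; error handling trimmed for brevity.
static llama_context * init_from_gpt_params_sketch(gpt_params & params) {
    llama_backend_init();
    llama_numa_init(params.numa);

    llama_model_params model_params = llama_model_params_from_gpt_params(params);
    llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
    if (model == NULL) {
        return NULL;
    }

    llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
    // passkey additionally widens n_ctx for self-extend (see the hunk above)

    return llama_new_context_with_model(model, ctx_params);
}
```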
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp
index bae014e6f..0bd78c21a 100644
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -44,9 +44,9 @@ static void write_logfile(
return;
}
- const std::string timestamp = get_sortable_timestamp();
+ const std::string timestamp = string_get_sortable_timestamp();
- const bool success = create_directory_with_parents(params.logdir);
+ const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str());
@@ -64,7 +64,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n");
char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc));
- dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc);
+ yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n");
fprintf(logfile, "######################\n");
@@ -72,9 +72,9 @@ static void write_logfile(
fprintf(logfile, "######################\n");
fprintf(logfile, "\n");
- dump_vector_float_yaml(logfile, "logits", results.logits);
+ yaml_dump_vector_float(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
- dump_vector_float_yaml(logfile, "probs", results.probs);
+ yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile);
@@ -1032,7 +1032,7 @@ struct winogrande_entry {
std::vector<llama_token> seq_tokens[2];
};
-static std::vector