diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile
index 059fd2695..c01006efe 100644
--- a/.devops/full-cuda.Dockerfile
+++ b/.devops/full-cuda.Dockerfile
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile
index 6ecf3bcc7..0314d469b 100644
--- a/.devops/full-rocm.Dockerfile
+++ b/.devops/full-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]
diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile
index 432fb5dad..6d5943a2f 100644
--- a/.devops/full.Dockerfile
+++ b/.devops/full.Dockerfile
@@ -18,7 +18,7 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
ENV LC_ALL=C.utf8
diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile
index b937a4829..23f428944 100644
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
-RUN make
+RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile
index 274b91b71..7516c8313 100644
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
diff --git a/.devops/main-rocm.Dockerfile b/.devops/main-rocm.Dockerfile
index 0a706dc73..37576d68e 100644
--- a/.devops/main-rocm.Dockerfile
+++ b/.devops/main-rocm.Dockerfile
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT [ "/app/main" ]
diff --git a/.devops/main.Dockerfile b/.devops/main.Dockerfile
index 3ab1decd6..763d75fce 100644
--- a/.devops/main.Dockerfile
+++ b/.devops/main.Dockerfile
@@ -9,7 +9,7 @@ WORKDIR /app
COPY . .
-RUN make
+RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime
diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile
index 59a52ba21..7f5228185 100644
--- a/.devops/server-cuda.Dockerfile
+++ b/.devops/server-cuda.Dockerfile
@@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile
index a8e451fa9..13d00b737 100644
--- a/.devops/server-intel.Dockerfile
+++ b/.devops/server-intel.Dockerfile
@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
@@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+ echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+ chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+ rm /etc/apt/sources.list.d/intel-graphics.list && \
+ wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+ echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+ chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile
index c02a31dd8..a6b76dee8 100644
--- a/.devops/server-rocm.Dockerfile
+++ b/.devops/server-rocm.Dockerfile
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
-RUN make
+RUN make -j$(nproc)
ENTRYPOINT [ "/app/server" ]
diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile
index be964e0e8..0d09d3627 100644
--- a/.devops/server.Dockerfile
+++ b/.devops/server.Dockerfile
@@ -11,7 +11,7 @@ COPY . .
ENV LLAMA_CURL=1
-RUN make
+RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime
diff --git a/.devops/tools.sh b/.devops/tools.sh
index 3a7d274e4..97424c3aa 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -8,7 +8,7 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
- python3 ./convert.py "$@"
+ python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
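With the change above, the `--convert`/`-c` entry point forwards to `convert-hf-to-gguf.py` instead of the deprecated `convert.py`. A minimal sketch of exercising the updated wrapper from the repository root (the model directory is a placeholder):

```bash
# hypothetical local invocation; /models/mymodel is a placeholder path
./.devops/tools.sh --convert /models/mymodel/
# which now runs: python3 ./convert-hf-to-gguf.py /models/mymodel/
```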
diff --git a/.github/ISSUE_TEMPLATE/01-bug-low.yml b/.github/ISSUE_TEMPLATE/01-bug-low.yml
new file mode 100644
index 000000000..bfb9d9a06
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/01-bug-low.yml
@@ -0,0 +1,50 @@
+name: Low Severity Bugs
+description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non-critical UI glitches)
+title: "Bug: "
+labels: ["bug-unconfirmed", "low severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/02-bug-medium.yml b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
new file mode 100644
index 000000000..e8297eea0
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/02-bug-medium.yml
@@ -0,0 +1,50 @@
+name: Medium Severity Bug
+description: Used to report medium severity bugs in llama.cpp (e.g. malfunctioning features, but generally still usable)
+title: "Bug: "
+labels: ["bug-unconfirmed", "medium severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/03-bug-high.yml b/.github/ISSUE_TEMPLATE/03-bug-high.yml
new file mode 100644
index 000000000..3c9d50d16
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/03-bug-high.yml
@@ -0,0 +1,50 @@
+name: High Severity Bug
+description: Used to report high severity bugs in llama.cpp (e.g. malfunctioning features hindering important common workflows)
+title: "Bug: "
+labels: ["bug-unconfirmed", "high severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/04-bug-critical.yml b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
new file mode 100644
index 000000000..d089d5fa1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/04-bug-critical.yml
@@ -0,0 +1,50 @@
+name: Critical Severity Bug
+description: Used to report critical severity bugs in llama.cpp (e.g. crashes, data corruption, data loss)
+title: "Bug: "
+labels: ["bug-unconfirmed", "critical severity"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Thanks for taking the time to fill out this bug report!
+ Please include information about your system, the steps to reproduce the bug,
+ and the version of llama.cpp that you are using.
+ If possible, please provide a minimal code example that reproduces the bug.
+ - type: textarea
+ id: what-happened
+ attributes:
+ label: What happened?
+ description: Also tell us, what did you expect to happen?
+ placeholder: Tell us what you see!
+ validations:
+ required: true
+ - type: textarea
+ id: version
+ attributes:
+ label: Name and Version
+ description: Which executable and which version of our software are you running? (use `--version` to get a version string)
+ placeholder: |
+ $./main --version
+ version: 2999 (42b4109e)
+ built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
+ validations:
+ required: true
+ - type: dropdown
+ id: operating-system
+ attributes:
+ label: What operating system are you seeing the problem on?
+ multiple: true
+ options:
+ - Linux
+ - Mac
+ - Windows
+ - BSD
+ - Other? (Please let us know in description)
+ validations:
+ required: false
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/05-enhancement.yml b/.github/ISSUE_TEMPLATE/05-enhancement.yml
new file mode 100644
index 000000000..58fca7318
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/05-enhancement.yml
@@ -0,0 +1,51 @@
+name: Enhancement
+description: Used to request enhancements for llama.cpp
+title: "Feature Request: "
+labels: ["enhancement"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+        [Please post your idea first in Discussions if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed need to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+
+ - type: checkboxes
+ id: prerequisites
+ attributes:
+ label: Prerequisites
+ description: Please confirm the following before submitting your enhancement request.
+ options:
+ - label: I am running the latest code. Mention the version if possible as well.
+ required: true
+ - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+ required: true
+ - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
+ required: true
+ - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+ required: true
+
+ - type: textarea
+ id: feature-description
+ attributes:
+ label: Feature Description
+ description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
+ placeholder: Detailed description of the enhancement
+ validations:
+ required: true
+
+ - type: textarea
+ id: motivation
+ attributes:
+ label: Motivation
+ description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
+ placeholder: Explanation of why this feature is needed and its benefits
+ validations:
+ required: true
+
+ - type: textarea
+ id: possible-implementation
+ attributes:
+ label: Possible Implementation
+ description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
+ placeholder: Detailed description of potential implementation
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/06-research.yml b/.github/ISSUE_TEMPLATE/06-research.yml
new file mode 100644
index 000000000..3ae4e9f8c
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/06-research.yml
@@ -0,0 +1,52 @@
+name: Research
+description: Track new technical research area
+title: "Research: "
+labels: ["research 🔬"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+
+ - type: checkboxes
+ id: research-stage
+ attributes:
+ label: Research Stage
+      description: Track the general state of this research ticket
+ options:
+ - label: Background Research (Let's try to avoid reinventing the wheel)
+        - label: Hypothesis Formed (How do you think this will work and what will its effect be?)
+ - label: Strategy / Implementation Forming
+ - label: Analysis of results
+ - label: Debrief / Documentation (So people in the future can learn from us)
+
+ - type: textarea
+ id: background
+ attributes:
+      label: Existing literature and research
+      description: What's the current state of the art and what's the motivation for this research?
+
+ - type: textarea
+ id: hypothesis
+ attributes:
+ label: Hypothesis
+      description: How do you think this will work, and what will its effect be?
+
+ - type: textarea
+ id: implementation
+ attributes:
+ label: Implementation
+ description: Got an approach? e.g. a PR ready to go?
+
+ - type: textarea
+ id: analysis
+ attributes:
+ label: Analysis
+ description: How does the proposed implementation behave?
+
+ - type: textarea
+ id: logs
+ attributes:
+ label: Relevant log output
+ description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+ render: shell
diff --git a/.github/ISSUE_TEMPLATE/07-refactor.yml b/.github/ISSUE_TEMPLATE/07-refactor.yml
new file mode 100644
index 000000000..3a68d3d53
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/07-refactor.yml
@@ -0,0 +1,28 @@
+name: Refactor (Maintainers)
+description: Used to track refactoring opportunities
+title: "Refactor: "
+labels: ["refactor"]
+body:
+ - type: markdown
+ attributes:
+ value: |
+ Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        You may also want to check the [refactor label on pull requests](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates.
+
+ - type: textarea
+ id: background-description
+ attributes:
+ label: Background Description
+ description: Please provide a detailed written description of the pain points you are trying to solve.
+      placeholder: Detailed description of your motivation for requesting this refactor
+ validations:
+ required: true
+
+ - type: textarea
+ id: possible-approaches
+ attributes:
+ label: Possible Refactor Approaches
+      description: If you have some idea of possible approaches to solve this problem, describe them here. You may want to make it a todo list.
+ placeholder: Your idea of possible refactoring opportunity/approaches
+ validations:
+ required: false
diff --git a/.github/ISSUE_TEMPLATE/bug.md b/.github/ISSUE_TEMPLATE/bug.md
deleted file mode 100644
index 49812832c..000000000
--- a/.github/ISSUE_TEMPLATE/bug.md
+++ /dev/null
@@ -1,11 +0,0 @@
----
-name: Bug template
-about: Used to report bugs in llama.cpp
-labels: ["bug-unconfirmed"]
-assignees: ''
-
----
-
-Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
-
-If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 000000000..c88134dbb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,13 @@
+blank_issues_enabled: true
+contact_links:
+ - name: Got an idea?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+ about: Pop it there. It may then become an enhancement ticket.
+ - name: Got a question?
+ url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+ about: Ask a question there!
+ - name: Want to contribute?
+ url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+ about: Head to the contribution guide page of the wiki for areas you can help with
+
+
diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md
deleted file mode 100644
index dcffda750..000000000
--- a/.github/ISSUE_TEMPLATE/enhancement.md
+++ /dev/null
@@ -1,28 +0,0 @@
----
-name: Enhancement template
-about: Used to request enhancements for llama.cpp
-labels: ["enhancement"]
-assignees: ''
-
----
-
-# Prerequisites
-
-Please answer the following questions for yourself before submitting an issue.
-
-- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
-- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
-- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
-- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
-
-# Feature Description
-
-Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
-
-# Motivation
-
-Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
-
-# Possible Implementation
-
-If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
diff --git a/.github/labeler.yml b/.github/labeler.yml
index a67f78044..97d739b58 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,5 +1,16 @@
# https://github.com/actions/labeler
-
+Kompute:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml-kompute.h
+ - ggml-kompute.cpp
+ - README-kompute.md
+Apple Metal:
+ - changed-files:
+ - any-glob-to-any-file:
+ - ggml-metal.h
+ - ggml-metal.cpp
+ - README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
@@ -9,6 +20,7 @@ SYCL:
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
+ - ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ef02ff669..60cf7bdc4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ else()
set(INS_ENB ON)
endif()
+option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
@@ -627,6 +628,10 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16)
endif()
+ if (LLAMA_CUDA_FORCE_MMQ)
+ add_compile_definitions(GGML_SYCL_FORCE_MMQ)
+ endif()
+
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
@@ -1040,6 +1045,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
+ if (LLAMA_SVE)
+ list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
+ endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@@ -1306,7 +1314,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
- FILES convert.py
+ FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
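A sketch of how the new `LLAMA_SVE` option might be enabled when configuring on an ARMv8.6+ host with an SVE-capable toolchain (build directory name is an assumption):

```bash
# opt into the SVE code path added above
cmake -B build -DLLAMA_SVE=ON
cmake --build build --config Release
```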
diff --git a/CMakePresets.json b/CMakePresets.json
index ad1af7ecc..e2b7a79e3 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -1,4 +1,4 @@
-{
+{
"version": 4,
"configurePresets": [
{
@@ -40,6 +40,10 @@
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
- { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
+ { "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
+
+ { "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
+ { "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
+ { "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}
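A sketch of selecting one of the newly added x64 presets from a Windows MSVC environment (the binary directory comes from the inherited `base` preset and is an assumption here):

```bash
# show the available configure presets, then configure with the new x64 release preset
cmake --list-presets
cmake --preset x64-windows-msvc-release
```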
diff --git a/Makefile b/Makefile
index fe63cbd60..aced0e370 100644
--- a/Makefile
+++ b/Makefile
@@ -441,6 +441,9 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
+ifdef LLAMA_CUDA_DEBUG
+ MK_NVCCFLAGS += --device-debug
+endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else
@@ -568,6 +571,7 @@ ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+ MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
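A sketch of a build that exercises the new `LLAMA_CUDA_DEBUG` switch, assuming a CUDA toolchain is installed:

```bash
# parallel CUDA build with device-side debug info (--device-debug is passed to nvcc)
make -j$(nproc) LLAMA_CUDA=1 LLAMA_CUDA_DEBUG=1
```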
diff --git a/README-sycl.md b/README-sycl.md
index cfa248a95..37f0306dc 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
-| OS | Status | Verified |
-|---------|---------|------------------------------------|
-| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
-| Windows | Support | Windows 11 |
+| OS | Status | Verified |
+|---------|---------|------------------------------------------------|
+| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
+| Windows | Support | Windows 11 |
## Hardware
@@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
-| Intel Arc Series | Support | Arc 770, 730M |
+| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |
diff --git a/README.md b/README.md
index 2ee267fdf..e8bf1e246 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,9 @@

-[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://opensource.org/licenses/MIT)
+[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
-- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`; please use `convert-hf-to-gguf.py` instead** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -200,9 +203,14 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
+**Tools:**
+
+- [akx/ggify](https://github.com/akx/ggify) – download PyTorch models from HuggingFace Hub and convert them to GGML
+
---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
@@ -311,8 +319,6 @@ In order to build llama.cpp you have four different options.
make
```
- **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
-
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -324,23 +330,32 @@ In order to build llama.cpp you have four different options.
make
```
+ - Notes:
+ - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
+ - For faster repeated compilation, install [ccache](https://ccache.dev/).
+ - For debug builds, run `make LLAMA_DEBUG=1`
+
- Using `CMake`:
- ```bash
- cmake -B build
- cmake --build build --config Release
- ```
+ ```bash
+ cmake -B build
+ cmake --build build --config Release
+ ```
- **Note**: for `Debug` builds, there are two cases:
+ **Notes**:
- - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+ - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+ - For faster repeated compilation, install [ccache](https://ccache.dev/).
+ - For debug builds, there are two cases:
+
+ 1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
- - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+ 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
@@ -375,6 +390,14 @@ In order to build llama.cpp you have four different options.
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
+### Homebrew
+
+On Mac and Linux, the Homebrew package manager can be used via
+```bash
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -473,7 +496,8 @@ Building the program with BLAS support may lead to some performance improvements
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
- | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+ | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
+  | LLAMA_CUDA_FORCE_MMQ          | Boolean                | false   | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
@@ -692,7 +716,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
+It does not support LLaMA 3; you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
# obtain the official LLaMA model weights and place them in ./models
@@ -709,10 +734,10 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
diff --git a/ci/run.sh b/ci/run.sh
index 940299025..3fc5f48b2 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
- python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+ python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
diff --git a/common/common.cpp b/common/common.cpp
index 1a284c56d..0c573c57f 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1156,6 +1156,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.interactive_specials = true;
return true;
}
+ if (arg == "--special") {
+ params.special = true;
+ return true;
+ }
if (arg == "--embedding") {
params.embedding = true;
return true;
@@ -1614,6 +1618,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
printf(" -h, --help show this help message and exit\n");
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
+ printf(" --special special tokens output enabled\n");
printf(" --interactive-specials allow special tokens in user text, in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
@@ -2135,11 +2140,15 @@ bool fs_create_directory_with_parents(const std::string & path) {
std::string fs_get_cache_directory() {
std::string cache_directory = "";
+ auto ensure_trailing_slash = [](std::string p) {
+ // Make sure to add trailing slash
+ if (p.back() != DIRECTORY_SEPARATOR) {
+ p += DIRECTORY_SEPARATOR;
+ }
+ return p;
+ };
if (getenv("LLAMA_CACHE")) {
cache_directory = std::getenv("LLAMA_CACHE");
- if (cache_directory.back() != DIRECTORY_SEPARATOR) {
- cache_directory += DIRECTORY_SEPARATOR;
- }
} else {
#ifdef __linux__
if (std::getenv("XDG_CACHE_HOME")) {
@@ -2150,12 +2159,12 @@ std::string fs_get_cache_directory() {
#elif defined(__APPLE__)
cache_directory = std::getenv("HOME") + std::string("/Library/Caches/");
#elif defined(_WIN32)
- cache_directory = std::getenv("APPDATA");
+ cache_directory = std::getenv("LOCALAPPDATA");
#endif // __linux__
+ cache_directory = ensure_trailing_slash(cache_directory);
cache_directory += "llama.cpp";
- cache_directory += DIRECTORY_SEPARATOR;
}
- return cache_directory;
+ return ensure_trailing_slash(cache_directory);
}
@@ -3137,6 +3146,7 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
fprintf(stream, "cpu_has_gpublas: %s\n", ggml_cpu_has_gpublas() ? "true" : "false");
fprintf(stream, "cpu_has_neon: %s\n", ggml_cpu_has_neon() ? "true" : "false");
+ fprintf(stream, "cpu_has_sve: %s\n", ggml_cpu_has_sve() ? "true" : "false");
fprintf(stream, "cpu_has_f16c: %s\n", ggml_cpu_has_f16c() ? "true" : "false");
fprintf(stream, "cpu_has_fp16_va: %s\n", ggml_cpu_has_fp16_va() ? "true" : "false");
fprintf(stream, "cpu_has_wasm_simd: %s\n", ggml_cpu_has_wasm_simd() ? "true" : "false");
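The new `--special` flag enables printing of special tokens in the generated output. A minimal sketch of passing it to `main` (model path and prompt are assumptions):

```bash
# enable special token output; the model path is a placeholder
./main -m ./models/mymodel/ggml-model-Q4_K_M.gguf -p "Hello" --special
```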
diff --git a/common/common.h b/common/common.h
index 039042c81..079fae033 100644
--- a/common/common.h
+++ b/common/common.h
@@ -156,6 +156,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+ bool special = false; // enable special token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
diff --git a/common/train.cpp b/common/train.cpp
index 2d41a1d29..fef1e57c9 100644
--- a/common/train.cpp
+++ b/common/train.cpp
@@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
params.custom_n_ctx = false;
- params.use_flash = true;
+ params.use_flash = false;
params.use_checkpointing = true;
params.sample_start = "";
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py
index 1923b88ba..84b72348d 100755
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -81,6 +81,7 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+ {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 5a00a5e89..ad071b974 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -25,8 +25,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
-from convert import LlamaHfVocab
-
logger = logging.getLogger("hf-to-gguf")
@@ -313,11 +311,10 @@ class Model:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32
- block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+ shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
# reverse shape to make it similar to the internal ggml dimension order
- shape_str = f"""{{{', '.join(str(n) for n in reversed(
- (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
- )}}}"""
+ shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
# n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@@ -474,6 +471,9 @@ class Model:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
+ if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
+ # ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
+ res = "smaug-bpe"
if res is None:
logger.warning("\n")
@@ -632,7 +632,7 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self):
- vocab = LlamaHfVocab(self.dir_model)
+ vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
@@ -1315,6 +1315,17 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ if "add_prefix_space" in tokenizer_config_json:
+ self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
+
+ # Apply to granite small models only
+ if self.hparams.get("vocab_size", 32000) == 49152:
+ self.gguf_writer.add_add_bos_token(False)
+
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@@ -1329,9 +1340,9 @@ class LlamaModel(Model):
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
- if name.endswith("q_proj.weight"):
+ if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
- if name.endswith("k_proj.weight"):
+ if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
@@ -2393,7 +2404,8 @@ class CommandR2Model(Model):
# max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length
- self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
+ # aya-23 models don't have model_max_length specified
+ self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
def set_gguf_parameters(self):
super().set_gguf_parameters()
@@ -2466,6 +2478,236 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True)
+@Model.register("ArcticForCausalLM")
+class ArcticModel(Model):
+ model_arch = gguf.MODEL_ARCH.ARCTIC
+
+ def set_vocab(self):
+ # The reason for using a custom implementation here is that the
+ # snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
+ # tokenizer.model and used them as BOS and EOS instead of adding new tokens.
+ from sentencepiece import SentencePieceProcessor
+
+ tokenizer_path = self.dir_model / 'tokenizer.model'
+
+ if not tokenizer_path.is_file():
+ logger.error(f'Error: Missing {tokenizer_path}')
+ sys.exit(1)
+
+ # Read the whole vocabulary from the tokenizer.model file
+ tokenizer = SentencePieceProcessor()
+ tokenizer.LoadFromFile(str(tokenizer_path))
+
+ vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
+
+ tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
+ scores: list[float] = [-10000.0] * vocab_size
+ toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
+
+ for token_id in range(tokenizer.vocab_size()):
+
+ piece = tokenizer.IdToPiece(token_id)
+ text = piece.encode("utf-8")
+ score = tokenizer.GetScore(token_id)
+
+ toktype = SentencePieceTokenTypes.NORMAL
+ if tokenizer.IsUnknown(token_id):
+ toktype = SentencePieceTokenTypes.UNKNOWN
+ elif tokenizer.IsControl(token_id):
+ toktype = SentencePieceTokenTypes.CONTROL
+ elif tokenizer.IsUnused(token_id):
+ toktype = SentencePieceTokenTypes.UNUSED
+ elif tokenizer.IsByte(token_id):
+ toktype = SentencePieceTokenTypes.BYTE
+
+ tokens[token_id] = text
+ scores[token_id] = score
+ toktypes[token_id] = toktype
+
+        # Use the added_tokens_decoder field from tokenizer_config.json as the source
+ # of information about added/redefined tokens and modify them accordingly.
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+
+ if "added_tokens_decoder" in tokenizer_config_json:
+ added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
+ for token_id, token_json in added_tokens_decoder.items():
+ token_id = int(token_id)
+ if (token_id >= vocab_size):
+ logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
+ continue
+
+ token_content = token_json["content"]
+ token_type = SentencePieceTokenTypes.USER_DEFINED
+ token_score = -10000.0
+
+ # Map unk_token to UNKNOWN, other special tokens to CONTROL
+ # Set the score to 0.0 as in the original tokenizer.model
+ if ("special" in token_json) and token_json["special"]:
+ if token_content == tokenizer_config_json["unk_token"]:
+ token_type = SentencePieceTokenTypes.UNKNOWN
+ else:
+ token_type = SentencePieceTokenTypes.CONTROL
+ token_score = 0.0
+
+ logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
+ tokens[token_id] = token_content.encode("utf-8")
+ toktypes[token_id] = token_type
+ scores[token_id] = token_score
+
+ self.gguf_writer.add_tokenizer_model("llama")
+ self.gguf_writer.add_tokenizer_pre("default")
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_scores(scores)
+ self.gguf_writer.add_token_types(toktypes)
+
+ special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+ special_vocab.add_to_gguf(self.gguf_writer)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ n_head = self.hparams["num_attention_heads"]
+ n_kv_head = self.hparams.get("num_key_value_heads")
+
+ if name.endswith("q_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+ if name.endswith("k_proj.weight"):
+ data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+
+ # process the experts separately
+ if name.find("block_sparse_moe.experts") != -1:
+ n_experts = self.hparams["num_local_experts"]
+
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for wid in ["w1", "w2", "w3"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def write_tensors(self):
+ super().write_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
+@Model.register("DeepseekV2ForCausalLM")
+class DeepseekV2Model(Model):
+ model_arch = gguf.MODEL_ARCH.DEEPSEEK2
+
+ def set_vocab(self):
+ self._set_vocab_gpt2()
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+ hparams = self.hparams
+
+ self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
+ self.gguf_writer.add_vocab_size(hparams["vocab_size"])
+ if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
+ self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
+ self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
+ self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
+ self.gguf_writer.add_value_length(hparams["v_head_dim"])
+ self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
+ self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
+ self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
+ self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
+ self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
+
+ if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
+ if self.hparams["rope_scaling"].get("type") == "yarn":
+ self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
+ self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
+ self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
+
+ _experts: list[dict[str, Tensor]] | None = None
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # process the experts separately
+ if name.find("mlp.experts") != -1:
+ n_experts = self.hparams["n_routed_experts"]
+ assert bid is not None
+
+ if self._experts is None:
+ self._experts = [{} for _ in range(self.block_count)]
+
+ self._experts[bid][name] = data_torch
+
+ if len(self._experts[bid]) >= n_experts * 3:
+ tensors: list[tuple[str, Tensor]] = []
+
+ # merge the experts into a single 3d tensor
+ for w_name in ["down_proj", "gate_proj", "up_proj"]:
+ datas: list[Tensor] = []
+
+ for xid in range(n_experts):
+ ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
+ datas.append(self._experts[bid][ename])
+ del self._experts[bid][ename]
+
+ data_torch = torch.stack(datas, dim=0)
+
+ merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
+
+ new_name = self.map_tensor_name(merged_name)
+
+ tensors.append((new_name, data_torch))
+ return tensors
+ else:
+ return []
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+ def write_tensors(self):
+ super().write_tensors()
+
+ if self._experts is not None:
+ # flatten `list[dict[str, Tensor]]` into `list[str]`
+ experts = [k for d in self._experts for k in d.keys()]
+ if len(experts) > 0:
+ raise ValueError(f"Unprocessed experts: {experts}")
+
+
###### CONVERSION LOGIC ######
@@ -2598,7 +2840,12 @@ def main() -> None:
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
- model_class = Model.from_model_architecture(hparams["architectures"][0])
+ try:
+ model_class = Model.from_model_architecture(hparams["architectures"][0])
+ except NotImplementedError:
+ logger.error(f"Model {hparams['architectures'][0]} is not supported")
+ sys.exit(1)
+
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
logger.info("Set model parameters")
diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md
index 48769cdf6..138124248 100644
--- a/docs/HOWTO-add-model.md
+++ b/docs/HOWTO-add-model.md
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
diff --git a/convert.py b/examples/convert-legacy-llama.py
similarity index 82%
rename from convert.py
rename to examples/convert-legacy-llama.py
index da1247957..fd8401015 100755
--- a/convert.py
+++ b/examples/convert-legacy-llama.py
@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
import numpy as np
-from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
- sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+ # use .parent.parent since we are in "examples" directory
+ sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
+
import gguf
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
@@ -380,306 +382,6 @@ class Metadata:
return metadata
-#
-# vocab
-#
-
-
-@runtime_checkable
-class BaseVocab(Protocol):
- tokenizer_model: ClassVar[str]
- name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
- tokenizer_model = "no_vocab"
- name = "no_vocab"
-
- def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
- vocab_size: int
- added_tokens_dict: dict[str, int]
- added_tokens_list: list[str]
- fname_tokenizer: Path
-
- def __init__(self, base_path: Path): ...
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
- tokenizer_model = "gpt2"
- name = "bpe"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
-
- if (fname_tokenizer := base_path / 'vocab.json').exists():
- # "slow" tokenizer
- with open(fname_tokenizer, encoding="utf-8") as f:
- self.vocab = json.load(f)
-
- try:
- # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- else:
- # "fast" tokenizer
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding="utf-8") as f:
- tokenizer_json = json.load(f)
-
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- if (
- tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'ByteLevel'
- ):
- raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
- self.vocab = tokenizer_model["vocab"]
-
- if (added := tokenizer_json.get('added_tokens')) is not None:
- # Added tokens here can be duplicates of the main vocabulary.
- added_tokens = {item['content']: item['id']
- for item in added
- if item['content'] not in self.vocab}
-
- vocab_size = len(self.vocab)
- expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
- actual_ids = sorted(added_tokens.values())
- if expected_ids != actual_ids:
- expected_end_id = vocab_size + len(actual_ids) - 1
- raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
- f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
- items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [text for (text, idx) in items]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
- for i, _ in enumerate(self.vocab):
- yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.bpe_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab(Vocab):
- tokenizer_model = "llama"
- name = "spm"
-
- def __init__(self, base_path: Path):
- added_tokens: dict[str, int] = {}
- if (fname_tokenizer := base_path / 'tokenizer.model').exists():
- # normal location
- try:
- with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
- added_tokens = json.load(f)
- except FileNotFoundError:
- pass
- elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
- # not found in alternate location either
- raise FileNotFoundError('Cannot find tokenizer.model')
-
- self.sentencepiece_tokenizer = SentencePieceProcessor()
- self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
- vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
- new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
- expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
- actual_new_ids = sorted(new_tokens.keys())
-
- if expected_new_ids != actual_new_ids:
- raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
- # Token pieces that were added to the base vocabulary.
- self.added_tokens_dict = added_tokens
- self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
- self.vocab_size_base = vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
- self.fname_tokenizer = fname_tokenizer
-
- def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- tokenizer = self.sentencepiece_tokenizer
- for i in range(tokenizer.vocab_size()):
- piece = tokenizer.IdToPiece(i)
- text = piece.encode("utf-8")
- score: float = tokenizer.GetScore(i)
-
- toktype = gguf.TokenType.NORMAL
- if tokenizer.IsUnknown(i):
- toktype = gguf.TokenType.UNKNOWN
- if tokenizer.IsControl(i):
- toktype = gguf.TokenType.CONTROL
-
- # NOTE: I think added_tokens are user defined.
- # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
- # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
- if tokenizer.IsUnused(i):
- toktype = gguf.TokenType.UNUSED
- if tokenizer.IsByte(i):
- toktype = gguf.TokenType.BYTE
-
- yield text, score, toktype
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- score = -1000.0
- yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.sentencepiece_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class LlamaHfVocab(Vocab):
- tokenizer_model = "llama"
- name = "hfft"
-
- def __init__(self, base_path: Path):
- fname_tokenizer = base_path / FAST_TOKENIZER_FILE
- # if this fails, FileNotFoundError propagates to caller
- with open(fname_tokenizer, encoding='utf-8') as f:
- tokenizer_json = json.load(f)
-
- # pre-check so we know if we need transformers
- tokenizer_model: dict[str, Any] = tokenizer_json['model']
- is_llama3 = (
- tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
- and not tokenizer_model.get('byte_fallback', True)
- )
- if is_llama3:
- raise TypeError('Llama 3 must be converted with BpeVocab')
-
- if not is_llama3 and (
- tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
- or tokenizer_json['decoder']['type'] != 'Sequence'
- ):
- raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
- try:
- from transformers import AutoTokenizer
- except ImportError as e:
- raise ImportError(
- "To use LlamaHfVocab, please install the `transformers` package. "
- "You can install it with `pip install transformers`."
- ) from e
-
- # Allow the tokenizer to default to slow or fast versions.
- # Explicitly set tokenizer to use local paths.
- self.tokenizer = AutoTokenizer.from_pretrained(
- base_path,
- cache_dir=base_path,
- local_files_only=True,
- )
- assert self.tokenizer.is_fast # assume tokenizer.json is used
-
- # Initialize lists and dictionaries for added tokens
- self.added_tokens_list = []
- self.added_tokens_dict = dict()
- self.added_tokens_ids = set()
-
- # Process added tokens
- for tok, tokidx in sorted(
- self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
- ):
- # Only consider added tokens that are not in the base vocabulary
- if tokidx >= self.tokenizer.vocab_size:
- self.added_tokens_list.append(tok)
- self.added_tokens_dict[tok] = tokidx
- self.added_tokens_ids.add(tokidx)
-
- # Store special tokens and their IDs
- self.specials = {
- tok: self.tokenizer.get_vocab()[tok]
- for tok in self.tokenizer.all_special_tokens
- }
- self.special_ids = set(self.tokenizer.all_special_ids)
-
- # Set vocabulary sizes
- self.vocab_size_base = self.tokenizer.vocab_size
- self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
- self.fname_tokenizer = fname_tokenizer
-
- def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- reverse_vocab = {
- id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
- }
-
- for token_id in range(self.vocab_size_base):
- # Skip processing added tokens here
- if token_id in self.added_tokens_ids:
- continue
-
- # Convert token text to bytes
- token_text = reverse_vocab[token_id].encode("utf-8")
-
- # Yield token text, score, and type
- yield token_text, self.get_token_score(token_id), self.get_token_type(
- token_id, token_text, self.special_ids # Reuse already stored special IDs
- )
-
- def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
- # Special case for byte tokens
- if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
- return gguf.TokenType.BYTE
-
- # Determine token type based on whether it's a special token
- return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
- def get_token_score(self, token_id: int) -> float:
- # Placeholder for actual logic to determine the token's score
- # This needs to be implemented based on specific requirements
- return -1000.0 # Default score
-
- def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- for text in self.added_tokens_list:
- if text in self.specials:
- toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
- score = self.get_token_score(self.specials[text])
- else:
- toktype = gguf.TokenType.USER_DEFINED
- score = -1000.0
-
- yield text.encode("utf-8"), score, toktype
-
- def has_newline_token(self):
- return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
- def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
- yield from self.hf_tokens()
- yield from self.added_tokens()
-
- def __repr__(self) -> str:
- return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
#
# data loading
# TODO: reuse (probably move to gguf.py?)
diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
index 746c3fbef..8ca9f8915 100644
--- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
@@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
params.samples_start_after_nl = false;
params.use_adam = true;
- params.use_flash = true;
+ params.use_flash = false;
params.use_scratch = true;
// only adam
diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 82a001a0a..460dc5e75 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -178,6 +178,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
+ std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
@@ -212,6 +213,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99},
+ /* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
@@ -241,6 +243,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+ printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -308,32 +311,28 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
if (arg == "-h" || arg == "--help") {
print_usage(argc, argv);
exit(0);
- }
- else if (arg == "-m" || arg == "--model") {
+ } else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
- }
- else if (arg == "-p" || arg == "--n-prompt") {
+ } else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
- }
- else if (arg == "-n" || arg == "--n-gen") {
+ } else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
- }
- else if (arg == "-pg") {
+ } else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -344,24 +343,21 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
break;
}
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
- }
- else if (arg == "-b" || arg == "--batch-size") {
+ } else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
- }
- else if (arg == "-ub" || arg == "--ubatch-size") {
+ } else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
- }
- else if (arg == "-ctk" || arg == "--cache-type-k") {
+ } else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -377,8 +373,7 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
types.push_back(gt);
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
- }
- else if (arg == "-ctv" || arg == "--cache-type-v") {
+ } else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -394,24 +389,27 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
types.push_back(gt);
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
- }
- else if (arg == "-t" || arg == "--threads") {
+ } else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
- }
- else if (arg == "-ngl" || arg == "--n-gpu-layers") {
+ } else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
- }
- else if (arg == "-sm" || arg == "--split-mode") {
+ } else if (arg == "-rpc" || arg == "--rpc") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ params.rpc_servers.push_back(argv[i]);
+ } else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -422,42 +420,35 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_MODE_NONE;
- }
- else if (m == "layer") {
+ } else if (m == "layer") {
mode = LLAMA_SPLIT_MODE_LAYER;
- }
- else if (m == "row") {
+ } else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
- }
- else {
+ } else {
invalid_param = true;
break;
}
modes.push_back(mode);
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
- }
- else if (arg == "-mg" || arg == "--main-gpu") {
+ } else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = split(argv[i], split_delim);
- }
- else if (arg == "-nkvo" || arg == "--no-kv-offload") {
+ } else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
- }
- else if (arg == "--numa") {
+ } else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
- }
- else {
+ } else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
@@ -465,15 +456,13 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
else { invalid_param = true; break; }
}
- }
- else if (arg == "-mt" || arg == "--max-threads") {
+ } else if (arg == "-mt" || arg == "--max-threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.cpuparams.n_threads = std::stoi(argv[i]);
- }
- else if (arg == "-C" || arg == "--cpu-mask") {
+ } else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -481,8 +470,7 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
std::string mask = argv[i];
params.cpuparams.mask_valid = true;
invalid_param = !parse_cpu_mask(mask, params.cpuparams.cpumask);
- }
- else if (arg == "--prio") {
+ } else if (arg == "--prio") {
if (++i >= argc) {
invalid_param = true;
break;
@@ -581,6 +569,7 @@ static cmd_params parse_cmd_params(int argc, char** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
+ if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@@ -603,6 +592,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
+ std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
@@ -615,6 +605,9 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
+ if (!rpc_servers.empty()) {
+ mparams.rpc_servers = rpc_servers.c_str();
+ }
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@@ -626,6 +619,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
+ rpc_servers == other.rpc_servers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
@@ -654,6 +648,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
+ for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
@@ -680,6 +675,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -705,6 +701,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -730,6 +727,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
+ /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@@ -754,6 +752,7 @@ struct test {
static const bool kompute;
static const bool metal;
static const bool sycl;
+ static const bool rpc;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
@@ -852,6 +851,9 @@ struct test {
if (sycl) {
return GGML_SYCL_NAME;
}
+ if (rpc) {
+ return "RPC";
+ }
if (gpu_blas) {
return "GPU BLAS";
}
@@ -865,7 +867,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
- "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
+ "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
@@ -921,7 +923,7 @@ struct test {
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
- std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
+ std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
@@ -956,6 +958,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
+const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
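For context on the `llama-bench` change above: the new `-rpc` flag passes a comma-separated server list through to `llama_model_params.rpc_servers`, so RPC-backed configurations can be benchmarked alongside local ones. A minimal sketch, with a placeholder model path and placeholder server addresses:

```sh
# each entry is the host:port of a machine running an RPC backend (addresses are placeholders)
./llama-bench -m ./models/model.gguf -ngl 99 -rpc "192.168.1.10:50052,192.168.1.11:50052"
```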
diff --git a/examples/llama.android/app/build.gradle.kts b/examples/llama.android/app/build.gradle.kts
index d42140efe..8d1b37195 100644
--- a/examples/llama.android/app/build.gradle.kts
+++ b/examples/llama.android/app/build.gradle.kts
@@ -7,8 +7,6 @@ android {
namespace = "com.example.llama"
compileSdk = 34
- ndkVersion = "26.1.10909125"
-
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@@ -20,17 +18,6 @@ android {
vectorDrawables {
useSupportLibrary = true
}
- ndk {
- // Add NDK properties if wanted, e.g.
- // abiFilters += listOf("arm64-v8a")
- }
- externalNativeBuild {
- cmake {
- arguments += "-DCMAKE_BUILD_TYPE=Release"
- cppFlags += listOf()
- arguments += listOf()
- }
- }
}
buildTypes {
@@ -55,17 +42,6 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
- packaging {
- resources {
- excludes += "/META-INF/{AL2.0,LGPL2.1}"
- }
- }
- externalNativeBuild {
- cmake {
- path = file("src/main/cpp/CMakeLists.txt")
- version = "3.22.1"
- }
- }
}
dependencies {
@@ -78,6 +54,7 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
+ implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
index be95e2221..45ac29938 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
+++ b/examples/llama.android/app/src/main/java/com/example/llama/MainViewModel.kt
@@ -1,5 +1,6 @@
package com.example.llama
+import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
@@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
-class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
+class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
@@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
- llm.unload()
+ llamaAndroid.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
@@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
messages += ""
viewModelScope.launch {
- llm.send(text)
+ llamaAndroid.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
@@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
val start = System.nanoTime()
- val warmupResult = llm.bench(pp, tg, pl, nr)
+ val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
@@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
return@launch
}
- messages += llm.bench(512, 128, 1, 3)
+ messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
@@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
fun load(pathToModel: String) {
viewModelScope.launch {
try {
- llm.load(pathToModel)
+ llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)
diff --git a/examples/llama.android/build.gradle.kts b/examples/llama.android/build.gradle.kts
index 50ebc8211..acd1ada7d 100644
--- a/examples/llama.android/build.gradle.kts
+++ b/examples/llama.android/build.gradle.kts
@@ -2,4 +2,5 @@
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
+ id("com.android.library") version "8.2.0" apply false
}
diff --git a/examples/llama.android/llama/.gitignore b/examples/llama.android/llama/.gitignore
new file mode 100644
index 000000000..796b96d1c
--- /dev/null
+++ b/examples/llama.android/llama/.gitignore
@@ -0,0 +1 @@
+/build
diff --git a/examples/llama.android/app/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/CMakeLists.txt
similarity index 98%
rename from examples/llama.android/app/src/main/cpp/CMakeLists.txt
rename to examples/llama.android/llama/CMakeLists.txt
index 4536974a5..a5618cac0 100644
--- a/examples/llama.android/app/src/main/cpp/CMakeLists.txt
+++ b/examples/llama.android/llama/CMakeLists.txt
@@ -42,7 +42,7 @@ add_subdirectory(../../../../../../ build-llama)
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
- llama-android.cpp)
+ llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts
new file mode 100644
index 000000000..0a3806172
--- /dev/null
+++ b/examples/llama.android/llama/build.gradle.kts
@@ -0,0 +1,68 @@
+plugins {
+ id("com.android.library")
+ id("org.jetbrains.kotlin.android")
+}
+
+android {
+ namespace = "android.llama.cpp"
+ compileSdk = 34
+
+ defaultConfig {
+ minSdk = 33
+
+ testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
+ consumerProguardFiles("consumer-rules.pro")
+ ndk {
+ // Add NDK properties if wanted, e.g.
+ // abiFilters += listOf("arm64-v8a")
+ }
+ externalNativeBuild {
+ cmake {
+ arguments += "-DCMAKE_BUILD_TYPE=Release"
+ cppFlags += listOf()
+ arguments += listOf()
+
+ cppFlags("")
+ }
+ }
+ }
+
+ buildTypes {
+ release {
+ isMinifyEnabled = false
+ proguardFiles(
+ getDefaultProguardFile("proguard-android-optimize.txt"),
+ "proguard-rules.pro"
+ )
+ }
+ }
+ externalNativeBuild {
+ cmake {
+ path("src/main/cpp/CMakeLists.txt")
+ version = "3.22.1"
+ }
+ }
+ compileOptions {
+ sourceCompatibility = JavaVersion.VERSION_1_8
+ targetCompatibility = JavaVersion.VERSION_1_8
+ }
+ kotlinOptions {
+ jvmTarget = "1.8"
+ }
+
+ packaging {
+ resources {
+ excludes += "/META-INF/{AL2.0,LGPL2.1}"
+ }
+ }
+}
+
+dependencies {
+
+ implementation("androidx.core:core-ktx:1.12.0")
+ implementation("androidx.appcompat:appcompat:1.6.1")
+ implementation("com.google.android.material:material:1.11.0")
+ testImplementation("junit:junit:4.13.2")
+ androidTestImplementation("androidx.test.ext:junit:1.1.5")
+ androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
+}
diff --git a/examples/llama.android/llama/consumer-rules.pro b/examples/llama.android/llama/consumer-rules.pro
new file mode 100644
index 000000000..e69de29bb
diff --git a/examples/llama.android/llama/proguard-rules.pro b/examples/llama.android/llama/proguard-rules.pro
new file mode 100644
index 000000000..f1b424510
--- /dev/null
+++ b/examples/llama.android/llama/proguard-rules.pro
@@ -0,0 +1,21 @@
+# Add project specific ProGuard rules here.
+# You can control the set of applied configuration files using the
+# proguardFiles setting in build.gradle.
+#
+# For more details, see
+# http://developer.android.com/guide/developing/tools/proguard.html
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+# public *;
+#}
+
+# Uncomment this to preserve the line number information for
+# debugging stack traces.
+#-keepattributes SourceFile,LineNumberTable
+
+# If you keep the line number information, uncomment this to
+# hide the original source file name.
+#-renamesourcefileattribute SourceFile
diff --git a/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
new file mode 100644
index 000000000..05d6ab5d2
--- /dev/null
+++ b/examples/llama.android/llama/src/androidTest/java/android/llama/cpp/ExampleInstrumentedTest.kt
@@ -0,0 +1,24 @@
+package android.llama.cpp
+
+import androidx.test.platform.app.InstrumentationRegistry
+import androidx.test.ext.junit.runners.AndroidJUnit4
+
+import org.junit.Test
+import org.junit.runner.RunWith
+
+import org.junit.Assert.*
+
+/**
+ * Instrumented test, which will execute on an Android device.
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+@RunWith(AndroidJUnit4::class)
+class ExampleInstrumentedTest {
+ @Test
+ fun useAppContext() {
+ // Context of the app under test.
+ val appContext = InstrumentationRegistry.getInstrumentation().targetContext
+ assertEquals("android.llama.cpp.test", appContext.packageName)
+ }
+}
diff --git a/examples/llama.android/llama/src/main/AndroidManifest.xml b/examples/llama.android/llama/src/main/AndroidManifest.xml
new file mode 100644
index 000000000..8bdb7e14b
--- /dev/null
+++ b/examples/llama.android/llama/src/main/AndroidManifest.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android">
+
+</manifest>
diff --git a/examples/llama.android/llama/src/main/cpp/CMakeLists.txt b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
new file mode 100644
index 000000000..42ebaad49
--- /dev/null
+++ b/examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -0,0 +1,49 @@
+# For more information about using CMake with Android Studio, read the
+# documentation: https://d.android.com/studio/projects/add-native-code.html.
+# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
+
+# Sets the minimum CMake version required for this project.
+cmake_minimum_required(VERSION 3.22.1)
+
+# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
+# Since this is the top level CMakeLists.txt, the project name is also accessible
+# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
+# build script scope).
+project("llama-android")
+
+include(FetchContent)
+FetchContent_Declare(
+ llama
+ GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+ GIT_TAG master
+)
+
+# Also provides "common"
+FetchContent_MakeAvailable(llama)
+
+# Creates and names a library, sets it as either STATIC
+# or SHARED, and provides the relative paths to its source code.
+# You can define multiple libraries, and CMake builds them for you.
+# Gradle automatically packages shared libraries with your APK.
+#
+# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
+# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
+# is preferred for the same purpose.
+#
+# In order to load a library into your app from Java/Kotlin, you must call
+# System.loadLibrary() and pass the name of the library defined here;
+# for GameActivity/NativeActivity derived applications, the same library name must be
+# used in the AndroidManifest.xml file.
+add_library(${CMAKE_PROJECT_NAME} SHARED
+ # List C/C++ source files with relative paths to this CMakeLists.txt.
+ llama-android.cpp)
+
+# Specifies libraries CMake should link to your target library. You
+# can link libraries from various origins, such as libraries defined in this
+# build script, prebuilt third-party libraries, or Android system libraries.
+target_link_libraries(${CMAKE_PROJECT_NAME}
+ # List libraries link to the target library
+ llama
+ common
+ android
+ log)
diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
similarity index 92%
rename from examples/llama.android/app/src/main/cpp/llama-android.cpp
rename to examples/llama.android/llama/src/main/cpp/llama-android.cpp
index 4af9de303..874158ef0 100644
--- a/examples/llama.android/app/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
+Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
@@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
+Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
+Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
@@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_bench_1model(
+Java_android_llama_cpp_LLamaAndroid_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
+Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
-Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
+Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
+Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
+Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
-Java_com_example_llama_Llm_completion_1init(
+Java_android_llama_cpp_LLamaAndroid_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
@@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
extern "C"
JNIEXPORT jstring JNICALL
-Java_com_example_llama_Llm_completion_1loop(
+Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
@@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
-Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
+Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}
diff --git a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
similarity index 97%
rename from examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
rename to examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
index d86afee37..6c63e54e0 100644
--- a/examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
+++ b/examples/llama.android/llama/src/main/java/android/llama/cpp/LLamaAndroid.kt
@@ -1,4 +1,4 @@
-package com.example.llama
+package android.llama.cpp
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
@@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
-class Llm {
+class LLamaAndroid {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@@ -165,8 +165,8 @@ class Llm {
}
// Enforce only one instance of Llm.
- private val _instance: Llm = Llm()
+ private val _instance: LLamaAndroid = LLamaAndroid()
- fun instance(): Llm = _instance
+ fun instance(): LLamaAndroid = _instance
}
}
diff --git a/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
new file mode 100644
index 000000000..cbbb974d3
--- /dev/null
+++ b/examples/llama.android/llama/src/test/java/android/llama/cpp/ExampleUnitTest.kt
@@ -0,0 +1,17 @@
+package android.llama.cpp
+
+import org.junit.Test
+
+import org.junit.Assert.*
+
+/**
+ * Example local unit test, which will execute on the development machine (host).
+ *
+ * See [testing documentation](http://d.android.com/tools/testing).
+ */
+class ExampleUnitTest {
+ @Test
+ fun addition_isCorrect() {
+ assertEquals(4, 2 + 2)
+ }
+}
diff --git a/examples/llama.android/settings.gradle.kts b/examples/llama.android/settings.gradle.kts
index 2ba32c4fa..c7c1a034a 100644
--- a/examples/llama.android/settings.gradle.kts
+++ b/examples/llama.android/settings.gradle.kts
@@ -15,3 +15,4 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid"
include(":app")
+include(":llama")
diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 413e433dd..74f021dec 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
--projector-type ldpv2
```
-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
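A sketch of that quantization step, assuming the `quantize` binary was built in the repository root and the conversion above produced `ggml-model-f16.gguf` (both file names are assumptions, not part of this diff):

```sh
# input/output names follow the defaults of the conversion step above
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k
```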
diff --git a/examples/llava/README.md b/examples/llava/README.md
index 4fb0cf381..8d1ae5270 100644
--- a/examples/llava/README.md
+++ b/examples/llava/README.md
@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
6) Then convert the model to gguf format:
```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava-cli using the 1.6 model version:
diff --git a/examples/llava/clip.h b/examples/llava/clip.h
index 45bdad689..ca3631384 100644
--- a/examples/llava/clip.h
+++ b/examples/llava/clip.h
@@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
-/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
+/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
diff --git a/examples/llava/requirements.txt b/examples/llava/requirements.txt
index f80f727a7..17cb4d5e5 100644
--- a/examples/llava/requirements.txt
+++ b/examples/llava/requirements.txt
@@ -1,3 +1,3 @@
--r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
pillow~=10.2.0
torch~=2.1.1
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 42a94a8c3..23415766b 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -767,18 +767,26 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
- const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
- printf("%s", token_str.c_str());
+ const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+ // Console/Stream Output
+ fprintf(stdout, "%s", token_str.c_str());
+
+ // Record Displayed Tokens To Log
+ // Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
+ // Incoming Requested Tokens
input_tokens.push_back(id);
} else {
+ // Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
+
+ fflush(stdout);
}
- fflush(stdout);
}
+
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
diff --git a/examples/make-ggml.py b/examples/make-ggml.py
deleted file mode 100755
index c73485ebf..000000000
--- a/examples/make-ggml.py
+++ /dev/null
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
-- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
-- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
-- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
-- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
-- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
-- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
-- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
-- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
-- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
-- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
-- Q2_K: smallest, extreme quality loss - not recommended
-- Q3_K: alias for Q3_K_M
-- Q3_K_S: very small, very high quality loss
-- Q3_K_M: very small, very high quality loss
-- Q3_K_L: small, substantial quality loss
-- Q4_K: alias for Q4_K_M
-- Q4_K_S: small, significant quality loss
-- Q4_K_M: medium, balanced quality - recommended
-- Q5_K: alias for Q5_K_M
-- Q5_K_S: large, low quality loss - recommended
-- Q5_K_M: large, very low quality loss - recommended
-- Q6_K: very large, extremely low quality loss
-- Q8_0: very large, extremely low quality loss - not recommended
-- F16: extremely large, virtually no quality loss - not recommended
-- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
- if not os.path.isdir(model):
- print(f"Model not found at {model}. Downloading...")
- try:
- if outname is None:
- outname = model.split('/')[-1]
- model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
- except Exception as e:
- raise Exception(f"Could not download the model: {e}")
-
- if outdir is None:
- outdir = f'../models/{outname}'
-
- if not os.path.isfile(f"{model}/config.json"):
- raise Exception(f"Could not find config.json in {model}")
-
- os.makedirs(outdir, exist_ok=True)
-
- print("Building llama.cpp")
- subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
- fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
- print(f"Making unquantised GGUF at {fp16}")
- if not os.path.isfile(fp16):
- if model_type != "llama":
- subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
- else:
- subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
- else:
- print(f"Unquantised GGML already exists at: {fp16}")
-
- print("Making quants")
- for type in quants:
- outfile = f"{outdir}/{outname}.gguf.{type}.bin"
- print(f"Making {type} : {outfile}")
- subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
- if not keep_fp16:
- os.remove(fp16)
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
- parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
- parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
- parser.add_argument('--outname', default=None, help='Output model(s) name')
- parser.add_argument('--outdir', default=None, help='Output directory')
- parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
- parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
- args = parser.parse_args()
-
- main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
diff --git a/examples/server/public/index.html b/examples/server/public/index.html
index 2961999f2..4c5a34d90 100644
--- a/examples/server/public/index.html
+++ b/examples/server/public/index.html
@@ -594,7 +594,7 @@
message = html`<${Probabilities} data=${data} />`
} else {
const text = isArrayMessage ?
- data.map(msg => msg.content).join('').replace(/^\s+/, '') :
+ data.map(msg => msg.content).join('') :
data;
message = isCompletionMode ?
text :
@@ -877,19 +877,30 @@
// poor mans markdown replacement
const Markdownish = (params) => {
- const md = params.text
- .replace(/&/g, '&amp;')
- .replace(/</g, '&lt;')
- .replace(/>/g, '&gt;')
- .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
- .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
- .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
- .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
- .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
- .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
- .replace(/`(.*?)`/g, '<code>$1</code>')
- .replace(/\n/gim, '<br />');
- return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`;
+ const chunks = params.text.split('```');
+
+ for (let i = 0; i < chunks.length; i++) {
+ if (i % 2 === 0) { // outside code block
+ chunks[i] = chunks[i]
+ .replace(/&/g, '&amp;')
+ .replace(/</g, '&lt;')
+ .replace(/>/g, '&gt;')
+ .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
+ .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+ .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
+ .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+ .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
+ .replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
+ .replace(/`(.*?)`/g, '<code>$1</code>')
+ .replace(/\n/gim, '<br />');
+ } else { // inside code block
+ chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
+ }
+ }
+
+ const restoredText = chunks.join('');
+
+ return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
};
const ModelGenerationInfo = (params) => {
@@ -903,6 +914,7 @@
`
}
+
// simple popover impl
const Popover = (props) => {
const isOpen = useSignal(false);
@@ -1054,4 +1066,3 @@