Merge branch 'ggerganov:master' into master

This commit is contained in:
woodx 2024-06-02 19:59:12 +08:00 committed by GitHub
commit a90b398b2f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
289 changed files with 37235 additions and 20094 deletions

View file

@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
# Enable cURL # Enable cURL
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"] ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev apt-get install -y libcurl4-openssl-dev
RUN make RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"] ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -18,7 +18,7 @@ COPY . .
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make RUN make -j$(nproc)
ENV LC_ALL=C.utf8 ENV LC_ALL=C.utf8

View file

@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA # Enable CUDA
ENV LLAMA_CUDA=1 ENV LLAMA_CUDA=1
RUN make RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

View file

@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y git apt-get install -y git

View file

@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++ ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make RUN make -j$(nproc)
ENTRYPOINT [ "/app/main" ] ENTRYPOINT [ "/app/main" ]

View file

@ -9,7 +9,7 @@ WORKDIR /app
COPY . . COPY . .
RUN make RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime FROM ubuntu:$UBUNTU_VERSION as runtime

View file

@ -214,7 +214,6 @@ effectiveStdenv.mkDerivation (
(cmakeBool "LLAMA_CUDA" useCuda) (cmakeBool "LLAMA_CUDA" useCuda)
(cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_HIPBLAS" useRocm)
(cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_METAL" useMetalKit)
(cmakeBool "LLAMA_MPI" useMpi)
(cmakeBool "LLAMA_VULKAN" useVulkan) (cmakeBool "LLAMA_VULKAN" useVulkan)
(cmakeBool "LLAMA_STATIC" enableStatic) (cmakeBool "LLAMA_STATIC" enableStatic)
] ]
@ -227,20 +226,20 @@ effectiveStdenv.mkDerivation (
) )
] ]
++ optionals useRocm [ ++ optionals useRocm [
(cmakeFeature "CMAKE_C_COMPILER" "hipcc") (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
(cmakeFeature "CMAKE_CXX_COMPILER" "hipcc") (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
# Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
# in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
# and select the line that matches the current nixpkgs version of rocBLAS.
# Should likely use `rocmPackages.clr.gpuTargets`.
"-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
] ]
++ optionals useMetalKit [ ++ optionals useMetalKit [
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
]; ];
# Environment variables needed for ROCm
env = optionals useRocm {
ROCM_PATH = "${rocmPackages.clr}";
HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
};
# TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
# if they haven't been added yet. # if they haven't been added yet.
postInstall = '' postInstall = ''

View file

@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
# Enable cURL # Enable cURL
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make RUN make -j$(nproc)
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

View file

@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev apt-get install -y git libcurl4-openssl-dev
@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev apt-get install -y libcurl4-openssl-dev

View file

@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev apt-get install -y libcurl4-openssl-dev
RUN make RUN make -j$(nproc)
ENTRYPOINT [ "/app/server" ] ENTRYPOINT [ "/app/server" ]

View file

@ -11,7 +11,7 @@ COPY . .
ENV LLAMA_CURL=1 ENV LLAMA_CURL=1
RUN make RUN make -j$(nproc)
FROM ubuntu:$UBUNTU_VERSION as runtime FROM ubuntu:$UBUNTU_VERSION as runtime

View file

@ -8,7 +8,7 @@ arg1="$1"
shift shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$@" python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@" ./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then

50
.github/ISSUE_TEMPLATE/01-bug-low.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,50 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

50
.github/ISSUE_TEMPLATE/03-bug-high.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,50 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,51 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
- type: checkboxes
id: prerequisites
attributes:
label: Prerequisites
description: Please confirm the following before submitting your enhancement request.
options:
- label: I am running the latest code. Mention the version if possible as well.
required: true
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
required: true
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
required: true
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
required: true
- type: textarea
id: feature-description
attributes:
label: Feature Description
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
placeholder: Detailed description of the enhancement
validations:
required: true
- type: textarea
id: motivation
attributes:
label: Motivation
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
placeholder: Explanation of why this feature is needed and its benefits
validations:
required: true
- type: textarea
id: possible-implementation
attributes:
label: Possible Implementation
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
placeholder: Detailed description of potential implementation
validations:
required: false

52
.github/ISSUE_TEMPLATE/06-research.yml vendored Normal file
View file

@ -0,0 +1,52 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
- type: checkboxes
id: research-stage
attributes:
label: Research Stage
description: Track general state of this research ticket
options:
- label: Background Research (Let's try to avoid reinventing the wheel)
- label: Hypothesis Formed (How do you think this will work and it's effect?)
- label: Strategy / Implementation Forming
- label: Analysis of results
- label: Debrief / Documentation (So people in the future can learn from us)
- type: textarea
id: background
attributes:
label: Previous existing literature and research
description: Whats the current state of the art and whats the motivation for this research?
- type: textarea
id: hypothesis
attributes:
label: Hypothesis
description: How do you think this will work and it's effect?
- type: textarea
id: implementation
attributes:
label: Implementation
description: Got an approach? e.g. a PR ready to go?
- type: textarea
id: analysis
attributes:
label: Analysis
description: How does the proposed implementation behave?
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

28
.github/ISSUE_TEMPLATE/07-refactor.yml vendored Normal file
View file

@ -0,0 +1,28 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
- type: markdown
attributes:
value: |
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
- type: textarea
id: background-description
attributes:
label: Background Description
description: Please provide a detailed written description of the pain points you are trying to solve.
placeholder: Detailed description behind your motivation to request refactor
validations:
required: true
- type: textarea
id: possible-approaches
attributes:
label: Possible Refactor Approaches
description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
placeholder: Your idea of possible refactoring opportunity/approaches
validations:
required: false

View file

@ -1,11 +0,0 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

13
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View file

@ -0,0 +1,13 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

View file

@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

90
.github/labeler.yml vendored Normal file
View file

@ -0,0 +1,90 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml-kompute.h
- ggml-kompute.cpp
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
- ggml-metal.h
- ggml-metal.cpp
- README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
- ggml-sycl.h
- ggml-sycl.cpp
- README-sycl.md
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:
- any-glob-to-any-file:
- ggml_vk_generate_shaders.py
- ggml-vulkan*
documentation:
- changed-files:
- any-glob-to-any-file:
- docs/**
- media/**
testing:
- changed-files:
- any-glob-to-any-file:
- tests/**
build:
- changed-files:
- any-glob-to-any-file:
- cmake/**
- CMakeLists.txt
- CMakePresets.json
- codecov.yml
examples:
- changed-files:
- any-glob-to-any-file: examples/**
devops:
- changed-files:
- any-glob-to-any-file:
- .devops/**
- .github/**
- ci/**
python:
- changed-files:
- any-glob-to-any-file:
- "**/*.py"
- requirements/**
- gguf-py/**
- .flake8
script:
- changed-files:
- any-glob-to-any-file:
- scripts/**
android:
- changed-files:
- any-glob-to-any-file:
- examples/llama.android/**
server:
- changed-files:
- any-glob-to-any-file:
- examples/server/**
ggml:
- changed-files:
- any-glob-to-any-file:
- ggml.c
- ggml.h
- ggml-*.c
- ggml-*.h
- ggml-cuda/**
nix:
- changed-files:
- any-glob-to-any-file:
- "**/*.nix"
- .github/workflows/nix-*.yml
- .devops/nix/nixpkgs-instances.nix
embedding:
- changed-files:
- any-glob-to-any-file: examples/embedding/

View file

@ -271,49 +271,15 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
name: llama-bin-ubuntu-x64.zip name: llama-bin-ubuntu-x64.zip
# ubuntu-latest-cmake-sanitizer: ubuntu-latest-cmake-sanitizer:
# runs-on: ubuntu-latest
#
# continue-on-error: true
#
# strategy:
# matrix:
# sanitizer: [ADDRESS, THREAD, UNDEFINED]
# build_type: [Debug, Release]
#
# steps:
# - name: Clone
# id: checkout
# uses: actions/checkout@v4
#
# - name: Dependencies
# id: depends
# run: |
# sudo apt-get update
# sudo apt-get install build-essential
#
# - name: Build
# id: cmake_build
# run: |
# mkdir build
# cd build
# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
# cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
#
# - name: Test
# id: cmake_test
# run: |
# cd build
# ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-mpi:
runs-on: ubuntu-latest runs-on: ubuntu-latest
continue-on-error: true continue-on-error: true
strategy: strategy:
matrix: matrix:
mpi_library: [mpich, libopenmpi-dev] sanitizer: [ADDRESS, THREAD, UNDEFINED]
build_type: [Debug, Release]
steps: steps:
- name: Clone - name: Clone
@ -324,21 +290,21 @@ jobs:
id: depends id: depends
run: | run: |
sudo apt-get update sudo apt-get update
sudo apt-get install build-essential ${{ matrix.mpi_library }} sudo apt-get install build-essential
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build mkdir build
cd build cd build
cmake -DLLAMA_MPI=ON .. cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config Release -j $(nproc) cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test - name: Test
id: cmake_test id: cmake_test
run: | run: |
cd build cd build
ctest -L main --verbose ctest -L main --verbose --timeout 900
ubuntu-latest-cmake-rpc: ubuntu-latest-cmake-rpc:
runs-on: ubuntu-latest runs-on: ubuntu-latest
@ -392,6 +358,33 @@ jobs:
cmake -DLLAMA_VULKAN=ON .. cmake -DLLAMA_VULKAN=ON ..
cmake --build . --config Release -j $(nproc) cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-hip:
runs-on: ubuntu-22.04
container: rocm/dev-ubuntu-22.04:6.0.2
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Dependencies
id: depends
run: |
sudo apt-get update
sudo apt-get install -y build-essential git cmake rocblas-dev hipblas-dev
- name: Build with native CMake HIP support
id: cmake_build
run: |
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release -j $(nproc)
- name: Build with legacy HIP support
id: cmake_build_legacy_hip
run: |
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
cmake --build build2 --config Release -j $(nproc)
ubuntu-22-cmake-sycl: ubuntu-22-cmake-sycl:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04
@ -693,26 +686,28 @@ jobs:
strategy: strategy:
matrix: matrix:
include: include:
- build: 'rpc' - build: 'rpc-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
- build: 'noavx' - build: 'noavx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx2' - build: 'avx2-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'avx' - build: 'avx-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512' - build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast' - build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas' - build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute' - build: 'kompute-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
- build: 'vulkan' - build: 'vulkan-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON' defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
- build: 'arm64' - build: 'llvm-arm64'
defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
- build: 'msvc-arm64'
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
steps: steps:
- name: Clone - name: Clone
@ -723,13 +718,13 @@ jobs:
- name: Clone Kompute submodule - name: Clone Kompute submodule
id: clone_kompute id: clone_kompute
if: ${{ matrix.build == 'kompute' }} if: ${{ matrix.build == 'kompute-x64' }}
run: | run: |
git submodule update --init kompute git submodule update --init kompute
- name: Download OpenCL SDK - name: Download OpenCL SDK
id: get_opencl id: get_opencl
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip" curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl mkdir $env:RUNNER_TEMP/opencl
@ -737,7 +732,7 @@ jobs:
- name: Download CLBlast - name: Download CLBlast
id: get_clblast id: get_clblast
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z" curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE" curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
@ -750,7 +745,7 @@ jobs:
- name: Download OpenBLAS - name: Download OpenBLAS
id: get_openblas id: get_openblas
if: ${{ matrix.build == 'openblas' }} if: ${{ matrix.build == 'openblas-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip" curl.exe -o $env:RUNNER_TEMP/openblas.zip -L "https://github.com/xianyi/OpenBLAS/releases/download/v${env:OPENBLAS_VERSION}/OpenBLAS-${env:OPENBLAS_VERSION}-x64.zip"
curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE" curl.exe -o $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt -L "https://github.com/xianyi/OpenBLAS/raw/v${env:OPENBLAS_VERSION}/LICENSE"
@ -763,38 +758,41 @@ jobs:
- name: Install Vulkan SDK - name: Install Vulkan SDK
id: get_vulkan id: get_vulkan
if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }} if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }}
run: | run: |
curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe" curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
& "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}" Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin" Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
- name: Install Ninja
id: install_ninja
run: |
choco install ninja
- name: Build - name: Build
id: cmake_build id: cmake_build
run: | run: |
mkdir build cmake -S . -B build ${{ matrix.defines }}
cd build cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
cmake .. ${{ matrix.defines }}
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll - name: Add clblast.dll
id: add_clblast_dll id: add_clblast_dll
if: ${{ matrix.build == 'clblast' }} if: ${{ matrix.build == 'clblast-x64' }}
run: | run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll - name: Add libopenblas.dll
id: add_libopenblas_dll id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas' }} if: ${{ matrix.build == 'openblas-x64' }}
run: | run: |
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
- name: Check AVX512F support - name: Check AVX512F support
id: check_avx512f id: check_avx512f
if: ${{ matrix.build == 'avx512' }} if: ${{ matrix.build == 'avx512-x64' }}
continue-on-error: true continue-on-error: true
run: | run: |
cd build cd build
@ -808,14 +806,14 @@ jobs:
- name: Test - name: Test
id: cmake_test id: cmake_test
# not all machines have native AVX-512 # not all machines have native AVX-512
if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: | run: |
cd build cd build
ctest -L main -C Release --verbose --timeout 900 ctest -L main -C Release --verbose --timeout 900
- name: Test (Intel SDE) - name: Test (Intel SDE)
id: cmake_test_sde id: cmake_test_sde
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
run: | run: |
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz" curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
# for some weird reason windows tar doesn't like sde tar.xz # for some weird reason windows tar doesn't like sde tar.xz
@ -843,14 +841,14 @@ jobs:
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: | run: |
Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt Copy-Item LICENSE .\build\bin\Release\llama.cpp.txt
7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip .\build\bin\Release\* 7z a llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip .\build\bin\Release\*
- name: Upload artifacts - name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v4 uses: actions/upload-artifact@v4
with: with:
path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}.zip
name: llama-bin-win-${{ matrix.build }}-x64.zip name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda: windows-latest-cmake-cuda:
runs-on: windows-latest runs-on: windows-latest
@ -984,6 +982,37 @@ jobs:
path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip
name: llama-bin-win-sycl-x64.zip name: llama-bin-win-sycl-x64.zip
windows-latest-cmake-hip:
runs-on: windows-latest
steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
- name: Install
id: depends
run: |
$ErrorActionPreference = "Stop"
write-host "Downloading AMD HIP SDK Installer"
Invoke-WebRequest -Uri "https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-23.Q4-WinSvr2022-For-HIP.exe" -OutFile "${env:RUNNER_TEMP}\rocm-install.exe"
write-host "Installing AMD HIP SDK"
Start-Process "${env:RUNNER_TEMP}\rocm-install.exe" -ArgumentList '-install' -NoNewWindow -Wait
write-host "Completed AMD HIP SDK installation"
- name: Verify ROCm
id: verify
run: |
& 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' --version
- name: Build
id: cmake_build
run: |
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
cmake --build build --config Release
ios-xcode-build: ios-xcode-build:
runs-on: macos-latest runs-on: macos-latest

17
.github/workflows/labeler.yml vendored Normal file
View file

@ -0,0 +1,17 @@
name: "Pull Request Labeler"
on:
- pull_request_target
jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'

View file

@ -32,10 +32,8 @@ jobs:
strategy: strategy:
matrix: matrix:
# TODO: temporary disabled due to linux kernel issues sanitizer: [ADDRESS, THREAD, UNDEFINED]
#sanitizer: [ADDRESS, THREAD, UNDEFINED] build_type: [RelWithDebInfo]
sanitizer: [UNDEFINED]
build_type: [Debug]
include: include:
- build_type: Release - build_type: Release
sanitizer: "" sanitizer: ""
@ -102,10 +100,8 @@ jobs:
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server
- name: Tests - name: Tests
id: server_integration_tests id: server_integration_tests
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: | run: |
cd examples/server/tests cd examples/server/tests
PORT=8888 ./tests.sh PORT=8888 ./tests.sh

View file

@ -1,29 +0,0 @@
name: Zig CI
on:
pull_request:
push:
branches:
- master
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true
jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v4
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace

View file

@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("llama.cpp" C CXX) project("llama.cpp" C CXX)
include(CheckIncludeFileCXX) include(CheckIncludeFileCXX)
@ -72,11 +72,13 @@ else()
set(INS_ENB ON) set(INS_ENB ON)
endif() endif()
option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB}) option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB}) option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF) option(LLAMA_AVX512 "llama: enable AVX512" OFF)
option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF) option(LLAMA_AVX512_VBMI "llama: enable AVX512-VBMI" OFF)
option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF) option(LLAMA_AVX512_VNNI "llama: enable AVX512-VNNI" OFF)
option(LLAMA_AVX512_BF16 "llama: enable AVX512-BF16" OFF)
option(LLAMA_FMA "llama: enable FMA" ${INS_ENB}) option(LLAMA_FMA "llama: enable FMA" ${INS_ENB})
# in MSVC F16C is implied with AVX2/AVX512 # in MSVC F16C is implied with AVX2/AVX512
if (NOT MSVC) if (NOT MSVC)
@ -104,6 +106,7 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access") "llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
@ -122,9 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"llama: metal minimum macOS version") "llama: metal minimum macOS version")
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_MPI "llama: use MPI" OFF)
option(LLAMA_RPC "llama: use RPC" OFF) option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF) option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF) option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device") set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@ -134,6 +135,8 @@ set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeli
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE}) option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ON) option(LLAMA_BUILD_SERVER "llama: build server example" ON)
option(LLAMA_LASX "llama: enable lasx" ON)
option(LLAMA_LSX "llama: enable lsx" ON)
# add perf arguments # add perf arguments
option(LLAMA_PERF "llama: enable perf" OFF) option(LLAMA_PERF "llama: enable perf" OFF)
@ -382,10 +385,6 @@ if (LLAMA_LLAMAFILE)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp) set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif() endif()
if (LLAMA_QKK_64)
add_compile_definitions(GGML_QKK_64)
endif()
if (LLAMA_CUBLAS) if (LLAMA_CUBLAS)
message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead")
set(LLAMA_CUDA ON) set(LLAMA_CUDA ON)
@ -404,6 +403,8 @@ if (LLAMA_CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA) add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS) add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@ -429,6 +430,18 @@ if (LLAMA_CUDA)
if (LLAMA_CUDA_NO_PEER_COPY) if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif() endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
if (LLAMA_STATIC) if (LLAMA_STATIC)
if (WIN32) if (WIN32)
@ -466,35 +479,6 @@ if (LLAMA_CUDA)
endif() endif()
endif() endif()
if (LLAMA_MPI)
cmake_minimum_required(VERSION 3.10)
find_package(MPI)
if (MPI_C_FOUND)
message(STATUS "MPI found")
set(GGML_HEADERS_MPI ggml-mpi.h)
set(GGML_SOURCES_MPI ggml-mpi.c)
add_compile_definitions(GGML_USE_MPI)
add_compile_definitions(${MPI_C_COMPILE_DEFINITIONS})
if (NOT MSVC)
add_compile_options(-Wno-cast-qual)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_C_LIBRARIES})
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${MPI_C_INCLUDE_DIRS})
# Even if you're only using the C header, C++ programs may bring in MPI
# C++ functions, so more linkage is needed
if (MPI_CXX_FOUND)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} ${MPI_CXX_LIBRARIES})
endif()
else()
message(WARNING "MPI not found")
endif()
endif()
if (LLAMA_RPC) if (LLAMA_RPC)
add_compile_definitions(GGML_USE_RPC) add_compile_definitions(GGML_USE_RPC)
@ -532,6 +516,12 @@ if (LLAMA_VULKAN)
add_compile_definitions(GGML_USE_VULKAN) add_compile_definitions(GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
# Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
endif()
if (LLAMA_VULKAN_CHECK_RESULTS) if (LLAMA_VULKAN_CHECK_RESULTS)
add_compile_definitions(GGML_VULKAN_CHECK_RESULTS) add_compile_definitions(GGML_VULKAN_CHECK_RESULTS)
endif() endif()
@ -555,16 +545,37 @@ if (LLAMA_VULKAN)
endif() endif()
if (LLAMA_HIPBLAS) if (LLAMA_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm) if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
else()
set(ROCM_PATH /opt/rocm)
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang") # CMake on Windows doesn't support the HIP language yet
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang") if(WIN32)
set(CXX_IS_HIPCC TRUE)
else()
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
endif() endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") if(CXX_IS_HIPCC)
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") if(LINUX)
endif() if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
" Prefer setting the HIP compiler directly. See README for details.")
endif()
else()
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
endif()
cmake_minimum_required(VERSION 3.21)
enable_language(HIP)
endif()
find_package(hip REQUIRED) find_package(hip REQUIRED)
find_package(hipblas REQUIRED) find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED) find_package(rocblas REQUIRED)
@ -575,6 +586,8 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@ -594,17 +607,35 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif() endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device)
else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif()
if (LLAMA_STATIC) if (LLAMA_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm") message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif() endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
endif() endif()
if (LLAMA_SYCL) if (LLAMA_SYCL)
@ -627,6 +658,10 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16) add_compile_definitions(GGML_SYCL_F16)
endif() endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()
add_compile_options(-I./) #include DPCT add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR}) add_compile_options(-I/${SYCL_INCLUDE_DIR})
@ -1007,6 +1042,11 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
if (GGML_COMPILER_SUPPORT_DOTPROD) if (GGML_COMPILER_SUPPORT_DOTPROD)
add_compile_definitions(__ARM_FEATURE_DOTPROD) add_compile_definitions(__ARM_FEATURE_DOTPROD)
endif () endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
endif ()
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@ -1035,6 +1075,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
# Raspberry Pi 3, 4, Zero 2 (32-bit) # Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access) list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif() endif()
if (LLAMA_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif() endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@ -1059,6 +1102,10 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>) add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif() endif()
if (LLAMA_AVX512_BF16)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
endif()
elseif (LLAMA_AVX2) elseif (LLAMA_AVX2)
list(APPEND ARCH_FLAGS /arch:AVX2) list(APPEND ARCH_FLAGS /arch:AVX2)
elseif (LLAMA_AVX) elseif (LLAMA_AVX)
@ -1090,6 +1137,9 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
if (LLAMA_AVX512_VNNI) if (LLAMA_AVX512_VNNI)
list(APPEND ARCH_FLAGS -mavx512vnni) list(APPEND ARCH_FLAGS -mavx512vnni)
endif() endif()
if (LLAMA_AVX512_BF16)
list(APPEND ARCH_FLAGS -mavx512bf16)
endif()
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64") elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
message(STATUS "PowerPC detected") message(STATUS "PowerPC detected")
@ -1099,6 +1149,17 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native) list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be) #TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
endif() endif()
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
message(STATUS "loongarch64 detected")
list(APPEND ARCH_FLAGS -march=loongarch64)
if (LLAMA_LASX)
list(APPEND ARCH_FLAGS -mlasx)
endif()
if (LLAMA_LSX)
list(APPEND ARCH_FLAGS -mlsx)
endif()
else() else()
message(STATUS "Unknown architecture") message(STATUS "Unknown architecture")
endif() endif()
@ -1187,7 +1248,6 @@ add_library(ggml OBJECT
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC} ${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
@ -1275,7 +1335,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h" set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}" "${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}") "${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER) install(TARGETS ggml PUBLIC_HEADER)
@ -1284,7 +1344,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER) install(TARGETS llama LIBRARY PUBLIC_HEADER)
install( install(
FILES convert.py FILES convert-hf-to-gguf.py
PERMISSIONS PERMISSIONS
OWNER_READ OWNER_READ
OWNER_WRITE OWNER_WRITE

49
CMakePresets.json Normal file
View file

@ -0,0 +1,49 @@
{
"version": 4,
"configurePresets": [
{
"name": "base",
"hidden": true,
"generator": "Ninja",
"binaryDir": "${sourceDir}/build-${presetName}",
"cacheVariables": {
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
}
},
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
{
"name": "arm64-windows-msvc", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
}
},
{
"name": "arm64-windows-llvm", "hidden": true,
"architecture": { "value": "arm64", "strategy": "external" },
"toolset": { "value": "host=x86_64", "strategy": "external" },
"cacheVariables": {
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
}
},
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}
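
A minimal usage sketch for the presets defined above, assuming a CMake new enough for preset schema version 4 (roughly 3.23+) and Ninja on the PATH; the preset name is one of those listed in the file:

```bash
# list the available presets, then configure and build one of them
cmake --list-presets
cmake --preset arm64-windows-llvm-release
# binaryDir is "${sourceDir}/build-${presetName}", so the build tree ends up here:
cmake --build build-arm64-windows-llvm-release
```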

View file

@ -379,15 +379,16 @@ ifneq ($(filter ppc64le%,$(UNAME_M)),)
CUDA_POWER_ARCH = 1 CUDA_POWER_ARCH = 1
endif endif
ifneq ($(filter loongarch64%,$(UNAME_M)),)
MK_CFLAGS += -mlasx
MK_CXXFLAGS += -mlasx
endif
else else
MK_CFLAGS += -march=rv64gcv -mabi=lp64d MK_CFLAGS += -march=rv64gcv -mabi=lp64d
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
endif endif
ifdef LLAMA_QKK_64
MK_CPPFLAGS += -DGGML_QKK_64
endif
ifndef LLAMA_NO_ACCELERATE ifndef LLAMA_NO_ACCELERATE
# Mac OS - include Accelerate framework. # Mac OS - include Accelerate framework.
# `-framework Accelerate` works both with Apple Silicon and Mac Intel # `-framework Accelerate` works both with Apple Silicon and Mac Intel
@ -399,13 +400,6 @@ ifndef LLAMA_NO_ACCELERATE
endif endif
endif # LLAMA_NO_ACCELERATE endif # LLAMA_NO_ACCELERATE
ifdef LLAMA_MPI
MK_CPPFLAGS += -DGGML_USE_MPI
MK_CFLAGS += -Wno-cast-qual
MK_CXXFLAGS += -Wno-cast-qual
OBJS += ggml-mpi.o
endif # LLAMA_MPI
ifdef LLAMA_OPENBLAS ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas) MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas) MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@ -427,6 +421,15 @@ ifdef LLAMA_CUBLAS
LLAMA_CUDA := 1 LLAMA_CUDA := 1
endif endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef LLAMA_CUDA ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)') ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda CUDA_PATH ?= /opt/cuda
@ -437,6 +440,7 @@ ifdef LLAMA_CUDA
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
MK_NVCCFLAGS += -use_fast_math MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings MK_NVCCFLAGS += -Werror all-warnings
@ -447,6 +451,9 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_DEBUG
MK_NVCCFLAGS += --device-debug
endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC ifdef LLAMA_CUDA_NVCC
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC) NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else else
@ -496,7 +503,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif endif # LLAMA_CUDA_CCBIN
ifdef LLAMA_CUDA_FA_ALL_QUANTS
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef JETSON_EOL_MODULE_DETECT ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE define NVCC_COMPILE
@ -508,7 +518,7 @@ define NVCC_COMPILE
endef # NVCC_COMPILE endef # NVCC_COMPILE
endif # JETSON_EOL_MODULE_DETECT endif # JETSON_EOL_MODULE_DETECT
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC_COMPILE) $(NVCC_COMPILE)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
@ -560,10 +570,10 @@ endif # LLAMA_VULKAN
ifdef LLAMA_HIPBLAS ifdef LLAMA_HIPBLAS
ifeq ($(wildcard /opt/rocm),) ifeq ($(wildcard /opt/rocm),)
ROCM_PATH ?= /usr ROCM_PATH ?= /usr
GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) AMDGPU_TARGETS ?= $(shell $(shell which amdgpu-arch))
else else
ROCM_PATH ?= /opt/rocm ROCM_PATH ?= /opt/rocm
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch) AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif endif
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_DMMV_X ?= 32
@ -574,8 +584,9 @@ ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS)) HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X) HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y) HIPFLAGS += -DGGML_CUDA_MMV_Y=$(LLAMA_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER) HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(LLAMA_CUDA_KQUANTS_ITER)
@ -587,11 +598,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS endif # LLAMA_HIPBLAS
@ -629,11 +641,6 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif endif
endif # LLAMA_METAL endif # LLAMA_METAL
ifdef LLAMA_MPI
ggml-mpi.o: ggml-mpi.c ggml-mpi.h
$(CC) $(CFLAGS) -c $< -o $@
endif # LLAMA_MPI
ifndef LLAMA_NO_LLAMAFILE ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CXX) $(CXXFLAGS) -c $< -o $@
@ -756,6 +763,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean: clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete find examples pocs -type f -name "*.o" -delete
# #
@ -824,7 +832,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

View file

@ -54,10 +54,10 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS ## OS
| OS | Status | Verified | | OS | Status | Verified |
|---------|---------|------------------------------------| |---------|---------|------------------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 | | Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
| Windows | Support | Windows 11 | | Windows | Support | Windows 11 |
## Hardware ## Hardware
@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------| |-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 | | Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 | | Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M | | Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | | Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | | Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

132
README.md
View file

@ -2,7 +2,9 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics ### Hot topics
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021** - **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@ -107,7 +110,6 @@ Typically finetunes of the base models below are supported as well.
- [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila) - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187) - [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
- [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim) - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
- [X] [Persimmon 8B](https://github.com/ggerganov/llama.cpp/pull/3410)
- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417) - [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553) - [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
- [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi) - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
@ -128,6 +130,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo) - [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md)) (instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
@ -141,11 +144,14 @@ Typically finetunes of the base models below are supported as well.
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) - [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
- [x] [Bunny](https://github.com/BAAI-DCAI/Bunny)
**HTTP server** **HTTP server**
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients. [llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[simplechat](./examples/server/public_simplechat) is a simple chat client that can be used to chat with the model exposed by the above web server (use `--path` to point to `simplechat`) from a local web browser.
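
As a minimal sketch, an OpenAI-style client can talk to the server like this; the launch flags, port, and endpoint shown here are assumptions for illustration and may differ for your build:

```bash
# start the server (model path and port are placeholders), then query the OpenAI-compatible endpoint
./server -m ./models/7B/ggml-model-q4_0.gguf --port 8080 &
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{"messages": [{"role": "user", "content": "Hello!"}]}'
```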
**Bindings:** **Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python) - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@ -199,9 +205,14 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
--- ---
Here is a typical run using LLaMA v2 13B on M2 Ultra: Here is a typical run using LLaMA v2 13B on M2 Ultra:
@ -301,7 +312,7 @@ cd llama.cpp
### Build ### Build
In order to build llama.cpp you have three different options. In order to build llama.cpp you have four different options.
- Using `make`: - Using `make`:
- On Linux or MacOS: - On Linux or MacOS:
@ -310,8 +321,6 @@ In order to build llama.cpp you have three different options.
make make
``` ```
**Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
- On Windows: - On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@ -323,23 +332,32 @@ In order to build llama.cpp you have three different options.
make make
``` ```
- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`: - Using `CMake`:
```bash ```bash
cmake -B build cmake -B build
cmake --build build --config Release cmake --build build --config Release
``` ```
**Note**: for `Debug` builds, there are two cases: **Notes**:
- Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash ```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build cmake --build build
``` ```
- Multi-config generators (`-G` param set to Visual Studio, XCode...): 2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash ```bash
cmake -B build -G "Xcode" cmake -B build -G "Xcode"
@ -374,6 +392,14 @@ In order to build llama.cpp you have three different options.
CLBLAST support for using OpenCL GPU acceleration in FreeBSD. Please read CLBLAST support for using OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate these options in this document below. the instructions for use and activate these options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Metal Build ### Metal Build
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU. On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@ -382,45 +408,6 @@ To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or th
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
argument. argument.
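
For example, a minimal sketch of the two controls described above (the binary and model paths are illustrative):

```bash
# build with the Metal backend disabled at compile time
make LLAMA_NO_METAL=1

# or keep the Metal build but force CPU-only inference at runtime
./main -m ./models/7B/ggml-model-q4_0.gguf -p "I believe the meaning of life is" -n 64 -ngl 0
```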
### MPI Build
MPI lets you distribute the computation over a cluster of machines. Because of the serial nature of LLM prediction, this won't yield any end-to-end speed-ups, but it will let you run larger models than would otherwise fit into RAM on a single machine.
First you will need MPI libraries installed on your system. The two most popular (only?) options are [MPICH](https://www.mpich.org) and [OpenMPI](https://www.open-mpi.org). Either can be installed with a package manager (`apt`, Homebrew, MacPorts, etc).
Next you will need to build the project with `LLAMA_MPI` set to true on all machines; if you're building with `make`, you will also need to specify an MPI-capable compiler (when building with CMake, this is configured automatically):
- Using `make`:
```bash
make CC=mpicc CXX=mpicxx LLAMA_MPI=1
```
- Using `CMake`:
```bash
cmake -S . -B build -DLLAMA_MPI=ON
```
Once the programs are built, download/convert the weights on all of the machines in your cluster. The paths to the weights and programs should be identical on all machines.
Next, ensure password-less SSH access to each machine from the primary host, and create a `hostfile` with a list of the hostnames and their relative "weights" (slots). If you want to use localhost for computation, use its local subnet IP address rather than the loopback address or "localhost".
Here is an example hostfile:
```
192.168.0.1:2
malvolio.local:1
```
The above will distribute the computation across 2 processes on the first host and 1 process on the second host. Each process will use roughly an equal amount of RAM. Try to keep these numbers small, as inter-process (intra-host) communication is expensive.
Finally, you're ready to run a computation using `mpirun`:
```bash
mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```
### BLAS Build ### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use: Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
@ -511,10 +498,12 @@ Building the program with BLAS support may lead to some performance improvements
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
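
As a sketch, a few of the options from the table above can be combined in a single CMake invocation; the particular combination is illustrative, not a recommendation:

```bash
# enable the CUDA backend plus two of the tuning options documented above
cmake -B build -DLLAMA_CUDA=ON -DLLAMA_CUDA_F16=ON -DLLAMA_CUDA_FA_ALL_QUANTS=ON
cmake --build build --config Release -j 8
```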
- #### hipBLAS - #### hipBLAS
@ -528,13 +517,28 @@ Building the program with BLAS support may lead to some performance improvements
``` ```
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
```bash ```bash
CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build --config Release -- -j 16 && cmake --build build --config Release -- -j 16
``` ```
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`. On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
Note that if you get the following error:
```
clang: error: cannot find ROCm device library; provide its path via '--rocm-path' or '--rocm-device-lib-path', or pass '-nogpulib' to build without ROCm device library
```
Try searching for a directory under `HIP_PATH` that contains the file
`oclc_abi_version_400.bc`. Then, add the following to the start of the
command: `HIP_DEVICE_LIB_PATH=<directory-you-just-found>`, so something
like:
```bash
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
&& cmake --build build -- -j 16
```
- Using `make` (example for target gfx1030, build with 16 CPU threads): - Using `make` (example for target gfx1030, build with 16 CPU threads):
```bash ```bash
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
@ -543,10 +547,8 @@ Building the program with BLAS support may lead to some performance improvements
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
```bash ```bash
set PATH=%HIP_PATH%\bin;%PATH% set PATH=%HIP_PATH%\bin;%PATH%
mkdir build cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
cd build cmake --build build
cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release ..
cmake --build .
``` ```
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`. Find your gpu version string by matching the most significant version information from `rocminfo | grep gfx | head -1 | awk '{print $2}'` with the list of processors, e.g. `gfx1035` maps to `gfx1030`.
@ -712,9 +714,13 @@ Building the program with BLAS support may lead to some performance improvements
### Prepare and Quantize ### Prepare and Quantize
> [!NOTE]
> You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to quantise your model weights without any local setup. It is synced from `llama.cpp` main every 6 hours.
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3; you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash ```bash
# obtain the official LLaMA model weights and place them in ./models # obtain the official LLaMA model weights and place them in ./models
@ -731,10 +737,10 @@ ls ./models
python3 -m pip install -r requirements.txt python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format # convert the model to ggml FP16 format
python3 convert.py models/mymodel/ python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers # [Optional] for models using BPE tokenizers
python convert.py models/mymodel/ --vocab-type bpe python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method) # quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M

172
build.zig
View file

@ -1,172 +0,0 @@
// Compatible with Zig Version 0.11.0
const std = @import("std");
const ArrayList = std.ArrayList;
const Compile = std.Build.Step.Compile;
const ConfigHeader = std.Build.Step.ConfigHeader;
const Mode = std.builtin.Mode;
const CrossTarget = std.zig.CrossTarget;
const Maker = struct {
builder: *std.build.Builder,
target: CrossTarget,
optimize: Mode,
enable_lto: bool,
include_dirs: ArrayList([]const u8),
cflags: ArrayList([]const u8),
cxxflags: ArrayList([]const u8),
objs: ArrayList(*Compile),
fn addInclude(m: *Maker, dir: []const u8) !void {
try m.include_dirs.append(dir);
}
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
}
fn addCFlag(m: *Maker, flag: []const u8) !void {
try m.cflags.append(flag);
}
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
try m.cxxflags.append(flag);
}
fn addFlag(m: *Maker, flag: []const u8) !void {
try m.addCFlag(flag);
try m.addCxxFlag(flag);
}
fn init(builder: *std.build.Builder) !Maker {
const target = builder.standardTargetOptions(.{});
const zig_version = @import("builtin").zig_version_string;
const commit_hash = try std.ChildProcess.exec(
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
);
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
\\int LLAMA_BUILD_NUMBER = {};
\\char const *LLAMA_COMMIT = "{s}";
\\char const *LLAMA_COMPILER = "Zig {s}";
\\char const *LLAMA_BUILD_TARGET = "{s}";
\\
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
var m = Maker{
.builder = builder,
.target = target,
.optimize = builder.standardOptimizeOption(.{}),
.enable_lto = false,
.include_dirs = ArrayList([]const u8).init(builder.allocator),
.cflags = ArrayList([]const u8).init(builder.allocator),
.cxxflags = ArrayList([]const u8).init(builder.allocator),
.objs = ArrayList(*Compile).init(builder.allocator),
};
try m.addCFlag("-std=c11");
try m.addCxxFlag("-std=c++11");
try m.addProjectInclude(&.{});
try m.addProjectInclude(&.{"common"});
return m;
}
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
if (o.target.getAbi() != .msvc)
o.defineCMacro("_GNU_SOURCE", null);
if (std.mem.endsWith(u8, src, ".c")) {
o.addCSourceFiles(&.{src}, m.cflags.items);
o.linkLibC();
} else {
o.addCSourceFiles(&.{src}, m.cxxflags.items);
if (o.target.getAbi() == .msvc) {
o.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
o.linkLibCpp();
}
}
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
o.want_lto = m.enable_lto;
return o;
}
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
e.addCSourceFiles(&.{src}, m.cxxflags.items);
for (deps) |d| e.addObject(d);
for (m.objs.items) |o| e.addObject(o);
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
// https://github.com/ziglang/zig/issues/15448
if (e.target.getAbi() == .msvc) {
e.linkLibC(); // need winsdk + crt
} else {
// linkLibCpp already add (libc++ + libunwind + libc)
e.linkLibCpp();
}
m.builder.installArtifact(e);
e.want_lto = m.enable_lto;
return e;
}
};
pub fn build(b: *std.build.Builder) !void {
var make = try Maker.init(b);
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
const ggml = make.obj("ggml", "ggml.c");
const sgemm = make.obj("sgemm", "sgemm.cpp");
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
const unicode = make.obj("unicode", "unicode.cpp");
const unicode_data = make.obj("unicode-data", "unicode-data.cpp");
const llama = make.obj("llama", "llama.cpp");
const buildinfo = make.obj("common", "common/build-info.cpp");
const common = make.obj("common", "common/common.cpp");
const console = make.obj("console", "common/console.cpp");
const sampling = make.obj("sampling", "common/sampling.cpp");
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
const json_schema_to_grammar = make.obj("json-schema-to-grammar", "common/json-schema-to-grammar.cpp");
const train = make.obj("train", "common/train.cpp");
const clip = make.obj("clip", "examples/llava/clip.cpp");
const llava = make.obj("llava", "examples/llava/llava.cpp");
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser });
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo });
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train });
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava });
if (server.target.isWindows()) {
server.linkSystemLibrary("ws2_32");
}
const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" };
for (server_assets) |asset| {
const input_path = b.fmt("examples/server/public/{s}", .{asset});
const output_path = b.fmt("examples/server/{s}.hpp", .{asset});
// Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`:
const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize));
defer b.allocator.free(input);
var buf = std.ArrayList(u8).init(b.allocator);
defer buf.deinit();
for (input) |byte| {
try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte});
}
var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_");
defer b.allocator.free(name);
std.mem.replaceScalar(u8, name, '.', '_');
try std.fs.cwd().writeFile(output_path, b.fmt(
"unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n",
.{ name, buf.items, name, input.len },
));
std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path });
}
}

423
ci/run.sh
View file

@ -202,12 +202,15 @@ function gg_sum_test_scripts_release {
} }
function gg_get_model { function gg_get_model {
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
if [[ -s $gguf_3b ]]; then local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
echo -n "$gguf_3b" if [[ -s $gguf_0 ]]; then
elif [[ -s $gguf_7b ]]; then echo -n "$gguf_0"
echo -n "$gguf_7b" elif [[ -s $gguf_1 ]]; then
echo -n "$gguf_1"
elif [[ -s $gguf_2 ]]; then
echo -n "$gguf_2"
else else
echo >&2 "No model found. Can't run gg_run_ctest_with_model." echo >&2 "No model found. Can't run gg_run_ctest_with_model."
exit 1 exit 1
@ -256,139 +259,6 @@ function gg_sum_ctest_with_model_release {
gg_printf '```\n' gg_printf '```\n'
} }
# open_llama_3b_v2
function gg_run_open_llama_3b_v2 {
cd ${SRC}
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/open-llama/3B-v2"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models}
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_open_llama_3b_v2 {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'OpenLLaMA 3B-v2:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# open_llama_7b_v2 # open_llama_7b_v2
# requires: GG_BUILD_CUDA # requires: GG_BUILD_CUDA
@ -417,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models} python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -526,6 +396,272 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)" gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
} }
# pythia_1.4b
function gg_run_pythia_1_4b {
cd ${SRC}
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
path_models="../models-mnt/pythia/1.4B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test_60="${path_wiki}/wiki.test-60.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_pythia_1_4b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 1.4B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# pythia_2_8b
# requires: GG_BUILD_CUDA
function gg_run_pythia_2_8b {
cd ${SRC}
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
path_models="../models-mnt/pythia/2.8B"
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
set -e
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
wiki_test="${path_wiki}/wiki.test.raw"
./bin/quantize ${model_f16} ${model_q8_0} q8_0
./bin/quantize ${model_f16} ${model_q4_0} q4_0
./bin/quantize ${model_f16} ${model_q4_1} q4_1
./bin/quantize ${model_f16} ${model_q5_0} q5_0
./bin/quantize ${model_f16} ${model_q5_1} q5_1
./bin/quantize ${model_f16} ${model_q2_k} q2_k
./bin/quantize ${model_f16} ${model_q3_k} q3_k
./bin/quantize ${model_f16} ${model_q4_k} q4_k
./bin/quantize ${model_f16} ${model_q5_k} q5_k
./bin/quantize ${model_f16} ${model_q6_k} q6_k
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
(time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
(time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
function check_ppl {
qnt="$1"
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
return 20
fi
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
return 0
}
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
set +e
}
function gg_sum_pythia_2_8b {
gg_printf '### %s\n\n' "${ci}"
gg_printf 'Pythia 2.8B:\n'
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
}
# bge-small # bge-small
function gg_run_embd_bge_small { function gg_run_embd_bge_small {
@ -552,7 +688,7 @@ function gg_run_embd_bge_small {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert-hf-to-gguf.py ${path_models} python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf" model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf" model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@ -606,9 +742,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then if [ -z ${GG_BUILD_CUDA} ]; then
test $ret -eq 0 && gg_run open_llama_3b_v2 test $ret -eq 0 && gg_run pythia_1_4b
else else
test $ret -eq 0 && gg_run open_llama_7b_v2 test $ret -eq 0 && gg_run pythia_2_8b
#test $ret -eq 0 && gg_run open_llama_7b_v2
fi fi
test $ret -eq 0 && gg_run ctest_with_model_debug test $ret -eq 0 && gg_run ctest_with_model_debug
test $ret -eq 0 && gg_run ctest_with_model_release test $ret -eq 0 && gg_run ctest_with_model_release
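The check_ppl helper defined inside each gg_run function gates the CI run on perplexity: it takes the last floating-point number on the perplexity summary line (the lines starting with "[1]") and fails the quant if it exceeds 20.0. A minimal Python sketch of the same gate, using a made-up log line and assuming only the log format implied above:

import re

def check_ppl(qnt: str, log_line: str, threshold: float = 20.0) -> bool:
    # take the last float on the line, as `grep -oE "[0-9]+\.[0-9]+" | tail -n 1` does
    ppl = float(re.findall(r"[0-9]+\.[0-9]+", log_line)[-1])
    if ppl > threshold:
        print(f" - {qnt} @ {ppl} (FAIL: ppl > {threshold})")
        return False
    print(f" - {qnt} @ {ppl} OK")
    return True

check_ppl("q4_0", "[1]6.1234,")  # hypothetical perplexity output line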
View file
@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
View file
@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )
set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
File diff suppressed because it is too large
View file
@ -27,7 +27,7 @@
#define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
#define print_build_info() do { \ #define print_build_info() do { \
fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
} while(0) } while(0)
@ -35,14 +35,18 @@
// build info // build info
extern int LLAMA_BUILD_NUMBER; extern int LLAMA_BUILD_NUMBER;
extern char const *LLAMA_COMMIT; extern char const * LLAMA_COMMIT;
extern char const *LLAMA_COMPILER; extern char const * LLAMA_COMPILER;
extern char const *LLAMA_BUILD_TARGET; extern char const * LLAMA_BUILD_TARGET;
struct llama_control_vector_load_info; struct llama_control_vector_load_info;
int get_math_cpu_count(); //
int32_t get_num_physical_cores(); // CPU utils
//
int32_t cpu_get_num_physical_cores();
int32_t cpu_get_num_math();
// //
// CLI argument parsing // CLI argument parsing
@ -51,7 +55,7 @@ int32_t get_num_physical_cores();
struct gpt_params { struct gpt_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
int32_t n_threads = get_math_cpu_count(); int32_t n_threads = cpu_get_num_math();
int32_t n_threads_draft = -1; int32_t n_threads_draft = -1;
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1; int32_t n_threads_batch_draft = -1;
@ -142,6 +146,7 @@ struct gpt_params {
bool use_color = false; // use color to distinguish generations and inputs bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool special = false; // enable special token output
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_all = false; // save user input and generations to prompt cache
@ -179,33 +184,34 @@ struct gpt_params {
void gpt_params_handle_model_default(gpt_params & params); void gpt_params_handle_model_default(gpt_params & params);
bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides); bool gpt_params_parse_ex (int argc, char ** argv, gpt_params & params);
bool gpt_params_parse (int argc, char ** argv, gpt_params & params);
bool gpt_params_find_arg (int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
void gpt_params_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); std::string gpt_params_get_system_info(const gpt_params & params);
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param);
std::string get_system_info(const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
void process_escapes(std::string& input);
bool validate_file_name(const std::string & filename);
// //
// String utils // String utils
// //
std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
std::vector<std::string> string_split(std::string input, char separator); std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str); std::string string_strip(const std::string & str);
std::string sampler_type_to_name_string(llama_sampler_type sampler_type); std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
//
// Filesystem utils
//
bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
// //
// Model utils // Model utils
@ -276,29 +282,15 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false. // defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model); bool llama_should_add_bos_token(const llama_model * model);
//
// YAML utils
//
bool create_directory_with_parents(const std::string & path);
void dump_vector_float_yaml(FILE * stream, const char * prop_name, const std::vector<float> & data);
void dump_vector_int_yaml(FILE * stream, const char * prop_name, const std::vector<int> & data);
void dump_string_yaml_multiline(FILE * stream, const char * prop_name, const char * data);
std::string get_sortable_timestamp();
void dump_non_result_info_yaml(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
// //
// KV cache utils // KV cache utils
// //
// Dump the KV cache view with the number of sequences per cell. // Dump the KV cache view with the number of sequences per cell.
void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
// Dump the KV cache view showing individual sequences in each cell (long output). // Dump the KV cache view showing individual sequences in each cell (long output).
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
// //
// Embedding utils // Embedding utils
@ -332,6 +324,20 @@ llama_control_vector_data llama_control_vector_load(const std::vector<llama_cont
// //
// Split utils // Split utils
// //
static const char * const LLM_KV_SPLIT_NO = "split.no"; static const char * const LLM_KV_SPLIT_NO = "split.no";
static const char * const LLM_KV_SPLIT_COUNT = "split.count"; static const char * const LLM_KV_SPLIT_COUNT = "split.count";
static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
//
// YAML utils
//
void yaml_dump_vector_float (FILE * stream, const char * prop_name, const std::vector<float> & data);
void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std::vector<int> & data);
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);
void yaml_dump_non_result_info(
FILE * stream, const gpt_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
View file
@ -26,7 +26,7 @@ namespace grammar_parser {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size()); uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id)); auto result = state.symbol_ids.emplace(std::string(src, len), next_id);
return result.first->second; return result.first->second;
} }
View file
@ -272,7 +272,7 @@ private:
if (literal.empty()) { if (literal.empty()) {
return false; return false;
} }
ret.push_back(std::make_pair(literal, true)); ret.emplace_back(literal, true);
literal.clear(); literal.clear();
return true; return true;
}; };
@ -298,7 +298,7 @@ private:
while (i < length) { while (i < length) {
char c = sub_pattern[i]; char c = sub_pattern[i];
if (c == '.') { if (c == '.') {
seq.push_back(std::make_pair(get_dot(), false)); seq.emplace_back(get_dot(), false);
i++; i++;
} else if (c == '(') { } else if (c == '(') {
i++; i++;
@ -307,7 +307,7 @@ private:
_warnings.push_back("Unsupported pattern syntax"); _warnings.push_back("Unsupported pattern syntax");
} }
} }
seq.push_back(std::make_pair("(" + to_rule(transform()) + ")", false)); seq.emplace_back("(" + to_rule(transform()) + ")", false);
} else if (c == ')') { } else if (c == ')') {
i++; i++;
if (start > 0 && sub_pattern[start - 1] != '(') { if (start > 0 && sub_pattern[start - 1] != '(') {
@ -331,9 +331,9 @@ private:
} }
square_brackets += ']'; square_brackets += ']';
i++; i++;
seq.push_back(std::make_pair(square_brackets, false)); seq.emplace_back(square_brackets, false);
} else if (c == '|') { } else if (c == '|') {
seq.push_back(std::make_pair("|", false)); seq.emplace_back("|", false);
i++; i++;
} else if (c == '*' || c == '+' || c == '?') { } else if (c == '*' || c == '+' || c == '?') {
seq.back() = std::make_pair(to_rule(seq.back()) + c, false); seq.back() = std::make_pair(to_rule(seq.back()) + c, false);
@ -417,7 +417,7 @@ private:
} }
} }
if (!literal.empty()) { if (!literal.empty()) {
seq.push_back(std::make_pair(literal, true)); seq.emplace_back(literal, true);
} }
} }
} }
View file
@ -211,7 +211,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_FLF_FMT "%s" #define LOG_FLF_FMT "%s"
@ -224,7 +224,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
#else #else
#define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] " #define LOG_TEE_FLF_FMT "[%24s:%5ld][%24s] "
#define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__ #define LOG_TEE_FLF_VAL , __FILE__, (long)__LINE__, __FUNCTION__
#endif #endif
#else #else
#define LOG_TEE_FLF_FMT "%s" #define LOG_TEE_FLF_FMT "%s"
@ -294,7 +294,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Main LOG macro. // Main LOG macro.
// behaves like printf, and supports arguments the exact same way. // behaves like printf, and supports arguments the exact same way.
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG(...) LOG_IMPL(__VA_ARGS__, "") #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
#else #else
#define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG(str, ...) LOG_IMPL("%s" str, "", ##__VA_ARGS__, "")
@ -308,14 +308,14 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// Secondary target can be changed just like LOG_TARGET // Secondary target can be changed just like LOG_TARGET
// by defining LOG_TEE_TARGET // by defining LOG_TEE_TARGET
// //
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "") #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
#else #else
#define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "") #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", ##__VA_ARGS__, "")
#endif #endif
// LOG macro variants with auto endline. // LOG macro variants with auto endline.
#ifndef _MSC_VER #if !defined(_MSC_VER) || defined(__clang__)
#define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n") #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
#define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n") #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
#else #else
View file
@ -125,7 +125,7 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
std::string result = "CFG -> Penalties "; std::string result = "CFG -> Penalties ";
if (params.mirostat == 0) { if (params.mirostat == 0) {
for (auto sampler_type : params.samplers_sequence) { for (auto sampler_type : params.samplers_sequence) {
const auto sampler_type_name = sampler_type_to_name_string(sampler_type); const auto sampler_type_name = llama_sampling_type_to_str(sampler_type);
if (!sampler_type_name.empty()) { if (!sampler_type_name.empty()) {
result += "-> " + sampler_type_name + " "; result += "-> " + sampler_type_name + " ";
} }
@ -137,6 +137,87 @@ std::string llama_sampling_order_print(const llama_sampling_params & params) {
return result; return result;
} }
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type) {
switch (sampler_type) {
case llama_sampler_type::TOP_K: return "top_k";
case llama_sampler_type::TFS_Z: return "tfs_z";
case llama_sampler_type::TYPICAL_P: return "typical_p";
case llama_sampler_type::TOP_P: return "top_p";
case llama_sampler_type::MIN_P: return "min_p";
case llama_sampler_type::TEMPERATURE: return "temperature";
default : return "";
}
}
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
std::unordered_map<std::string, llama_sampler_type> sampler_canonical_name_map {
{"top_k", llama_sampler_type::TOP_K},
{"top_p", llama_sampler_type::TOP_P},
{"typical_p", llama_sampler_type::TYPICAL_P},
{"min_p", llama_sampler_type::MIN_P},
{"tfs_z", llama_sampler_type::TFS_Z},
{"temperature", llama_sampler_type::TEMPERATURE}
};
// since samplers names are written multiple ways
// make it ready for both system names and input names
std::unordered_map<std::string, llama_sampler_type> sampler_alt_name_map {
{"top-k", llama_sampler_type::TOP_K},
{"top-p", llama_sampler_type::TOP_P},
{"nucleus", llama_sampler_type::TOP_P},
{"typical-p", llama_sampler_type::TYPICAL_P},
{"typical", llama_sampler_type::TYPICAL_P},
{"min-p", llama_sampler_type::MIN_P},
{"tfs-z", llama_sampler_type::TFS_Z},
{"tfs", llama_sampler_type::TFS_Z},
{"temp", llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names.size());
for (const auto & name : names)
{
auto sampler_item = sampler_canonical_name_map.find(name);
if (sampler_item != sampler_canonical_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
else
{
if (allow_alt_names)
{
sampler_item = sampler_alt_name_map.find(name);
if (sampler_item != sampler_alt_name_map.end())
{
sampler_types.push_back(sampler_item->second);
}
}
}
}
return sampler_types;
}
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string) {
std::unordered_map<char, llama_sampler_type> sampler_name_map {
{'k', llama_sampler_type::TOP_K},
{'p', llama_sampler_type::TOP_P},
{'y', llama_sampler_type::TYPICAL_P},
{'m', llama_sampler_type::MIN_P},
{'f', llama_sampler_type::TFS_Z},
{'t', llama_sampler_type::TEMPERATURE}
};
std::vector<llama_sampler_type> sampler_types;
sampler_types.reserve(names_string.size());
for (const auto & c : names_string) {
const auto sampler_item = sampler_name_map.find(c);
if (sampler_item != sampler_name_map.end()) {
sampler_types.push_back(sampler_item->second);
}
}
return sampler_types;
}
// no reasons to expose this function in header // no reasons to expose this function in header
static void sampler_queue( static void sampler_queue(
struct llama_context * ctx_main, struct llama_context * ctx_main,
@ -179,7 +260,7 @@ static llama_token llama_sampling_sample_impl(
struct llama_context * ctx_main, struct llama_context * ctx_main,
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx, const int idx,
bool is_resampling) { // Add a parameter to indicate if we are resampling bool is_resampling) {
const llama_sampling_params & params = ctx_sampling->params; const llama_sampling_params & params = ctx_sampling->params;
const float temp = params.temp; const float temp = params.temp;
@ -188,8 +269,8 @@ static llama_token llama_sampling_sample_impl(
const float mirostat_eta = params.mirostat_eta; const float mirostat_eta = params.mirostat_eta;
std::vector<float> original_logits; std::vector<float> original_logits;
auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, /* apply_grammar= */ is_resampling, &original_logits);
if (!is_resampling) { if (ctx_sampling->grammar != NULL && !is_resampling) {
GGML_ASSERT(!original_logits.empty()); GGML_ASSERT(!original_logits.empty());
} }
llama_token id = 0; llama_token id = 0;
@ -252,7 +333,7 @@ static llama_token llama_sampling_sample_impl(
// Restore logits from the copy // Restore logits from the copy
std::copy(original_logits.begin(), original_logits.end(), logits); std::copy(original_logits.begin(), original_logits.end(), logits);
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, true); // Pass true for is_resampling return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ true);
} }
} }
@ -285,7 +366,8 @@ static llama_token_data_array llama_sampling_prepare_impl(
// Get a pointer to the logits // Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx); float * logits = llama_get_logits_ith(ctx_main, idx);
if (apply_grammar && original_logits != NULL) { if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
} }
@ -342,7 +424,7 @@ llama_token llama_sampling_sample(
struct llama_context * ctx_cfg, struct llama_context * ctx_cfg,
const int idx) { const int idx) {
// Call the implementation function with is_resampling set to false by default // Call the implementation function with is_resampling set to false by default
return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, /* is_resampling= */ false);
} }
llama_token_data_array llama_sampling_prepare( llama_token_data_array llama_sampling_prepare(
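The sampler-name helpers added above let a sampler sequence be written either as full names (with alternate spellings) or as a compact per-character spec. A minimal Python sketch of the single-character mapping, mirroring the sampler_name_map table in llama_sampling_types_from_chars (illustration only, not the C++ API):

# mirrors sampler_name_map; unknown characters are silently skipped, as in the C++ code
CHAR_TO_SAMPLER = {
    "k": "top_k",
    "p": "top_p",
    "y": "typical_p",
    "m": "min_p",
    "f": "tfs_z",
    "t": "temperature",
}

def samplers_from_chars(spec: str) -> list:
    return [CHAR_TO_SAMPLER[c] for c in spec if c in CHAR_TO_SAMPLER]

print(samplers_from_chars("kfypmt"))
# ['top_k', 'tfs_z', 'typical_p', 'top_p', 'min_p', 'temperature']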
View file
@ -116,6 +116,11 @@ std::string llama_sampling_print(const llama_sampling_params & params);
// Print sampling order into a string // Print sampling order into a string
std::string llama_sampling_order_print(const llama_sampling_params & params); std::string llama_sampling_order_print(const llama_sampling_params & params);
std::string llama_sampling_type_to_str(llama_sampler_type sampler_type);
std::vector<llama_sampler_type> llama_sampling_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
std::vector<llama_sampler_type> llama_sampling_types_from_chars(const std::string & names_string);
// this is a common sampling function used across the examples for convenience // this is a common sampling function used across the examples for convenience
// it can serve as a starting point for implementing your own sampling function // it can serve as a starting point for implementing your own sampling function
// Note: When using multiple sequences, it is the caller's responsibility to call // Note: When using multiple sequences, it is the caller's responsibility to call
View file
@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
params.custom_n_ctx = false; params.custom_n_ctx = false;
params.use_flash = true; params.use_flash = false;
params.use_checkpointing = true; params.use_checkpointing = true;
params.sample_start = ""; params.sample_start = "";
@ -1380,7 +1380,7 @@ bool consume_common_train_arg(
void finish_processing_train_args(struct train_params_common * params) { void finish_processing_train_args(struct train_params_common * params) {
if (params->escape) { if (params->escape) {
process_escapes(params->sample_start); string_process_escapes(params->sample_start);
} }
} }
View file
@ -20,11 +20,13 @@
# - Update llama.cpp with the new pre-tokenizer if necessary # - Update llama.cpp with the new pre-tokenizer if necessary
# #
# TODO: generate tokenizer tests for llama.cpp # TODO: generate tokenizer tests for llama.cpp
# TODO: automate the update of convert-hf-to-gguf.py
# #
import logging import logging
import os import os
import pathlib
import re
import requests import requests
import sys import sys
import json import json
@ -35,6 +37,7 @@ from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update") logger = logging.getLogger("convert-hf-to-gguf-update")
sess = requests.Session()
class TOKENIZER_TYPE(IntEnum): class TOKENIZER_TYPE(IntEnum):
@ -69,6 +72,7 @@ models = [
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
@ -77,65 +81,47 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM! {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", }, {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", }, {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
] ]
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
os.makedirs("models/tokenizers")
def download_file_with_auth(url, token, save_path): def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"} headers = {"Authorization": f"Bearer {token}"}
response = requests.get(url, headers=headers) response = sess.get(url, headers=headers)
if response.status_code == 200: response.raise_for_status()
with open(save_path, 'wb') as f: os.makedirs(os.path.dirname(save_path), exist_ok=True)
f.write(response.content) with open(save_path, 'wb') as f:
logger.info(f"File {save_path} downloaded successfully") f.write(response.content)
else: logger.info(f"File {save_path} downloaded successfully")
logger.info(f"Failed to download file. Status code: {response.status_code}")
# download the tokenizer models def download_model(model):
for model in models:
name = model["name"] name = model["name"]
repo = model["repo"] repo = model["repo"]
tokt = model["tokt"] tokt = model["tokt"]
if not os.path.exists(f"models/tokenizers/{name}"): os.makedirs(f"models/tokenizers/{name}", exist_ok=True)
os.makedirs(f"models/tokenizers/{name}")
else:
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
continue
logger.info(f"Downloading {name} to models/tokenizers/{name}")
url = f"{repo}/raw/main/config.json"
save_path = f"models/tokenizers/{name}/config.json"
download_file_with_auth(url, token, save_path)
url = f"{repo}/raw/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
# if downloaded file is less than 1KB, we likely need to download an LFS instead
if os.path.getsize(save_path) < 1024:
# remove the file
os.remove(save_path)
url = f"{repo}/resolve/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
files = ["config.json", "tokenizer.json", "tokenizer_config.json"]
if tokt == TOKENIZER_TYPE.SPM: if tokt == TOKENIZER_TYPE.SPM:
url = f"{repo}/resolve/main/tokenizer.model" files.append("tokenizer.model")
save_path = f"models/tokenizers/{name}/tokenizer.model"
download_file_with_auth(url, token, save_path) for file in files:
save_path = f"models/tokenizers/{name}/{file}"
if os.path.isfile(save_path):
logger.info(f"{name}: File {save_path} already exists - skipping")
continue
download_file_with_auth(f"{repo}/resolve/main/{file}", token, save_path)
for model in models:
try:
download_model(model)
except Exception as e:
logger.error(f"Failed to download model {model['name']}. Error: {e}")
url = f"{repo}/raw/main/tokenizer_config.json"
save_path = f"models/tokenizers/{name}/tokenizer_config.json"
download_file_with_auth(url, token, save_path)
# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: # generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function:
# TODO: auto-update convert-hf-to-gguf.py with the generated function
src_ifs = "" src_ifs = ""
for model in models: for model in models:
@ -224,11 +210,18 @@ src_func = f"""
return res return res
""" """
print(src_func) # noqa: NP100 convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
convert_py = convert_py_pth.read_text()
convert_py = re.sub(
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
lambda m: m.group(1) + src_func + m.group(3),
convert_py,
flags=re.DOTALL | re.MULTILINE,
)
logger.info("\n") convert_py_pth.write_text(convert_py)
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n") logger.info("+++ convert-hf-to-gguf.py was updated")
# generate tests for each tokenizer model # generate tests for each tokenizer model
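For context, the chkhsh values matched in convert-hf-to-gguf.py (next file) are the identifiers this update script derives: each downloaded tokenizer encodes a fixed test string and the resulting token-id list is hashed. A minimal Python sketch of that idea, assuming a sha256 over the stringified token ids; the test string and the local tokenizer path here are placeholders, not the script's actual values:

from hashlib import sha256
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/tokenizers/gpt-2")  # hypothetical local path

chktxt = "placeholder for the shared test string used by the update script"
chktok = tokenizer.encode(chktxt)                   # token ids produced by this tokenizer
chkhsh = sha256(str(chktok).encode()).hexdigest()   # compared against the chkhsh literals below
print(chkhsh)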
View file
@ -14,6 +14,7 @@ from pathlib import Path
from hashlib import sha256 from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast
import math
import numpy as np import numpy as np
import torch import torch
@ -24,8 +25,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf import gguf
from convert import LlamaHfVocab
logger = logging.getLogger("hf-to-gguf") logger = logging.getLogger("hf-to-gguf")
@ -312,11 +311,10 @@ class Model:
data = data.astype(np.float32) data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32 data_qtype = gguf.GGMLQuantizationType.F32
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype] shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
# reverse shape to make it similar to the internal ggml dimension order # reverse shape to make it similar to the internal ggml dimension order
shape_str = f"""{{{', '.join(str(n) for n in reversed( shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
)}}}"""
# n_dims is implicit in the shape # n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@ -402,6 +400,7 @@ class Model:
# NOTE: this function is generated by convert-hf-to-gguf-update.py # NOTE: this function is generated by convert-hf-to-gguf-update.py
# do not modify it manually! # do not modify it manually!
# ref: https://github.com/ggerganov/llama.cpp/pull/6920 # ref: https://github.com/ggerganov/llama.cpp/pull/6920
# Marker: Start get_vocab_base_pre
def get_vocab_base_pre(self, tokenizer) -> str: def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model # is specific for the BPE pre-tokenizer used by the model
@ -445,6 +444,9 @@ class Model:
if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454":
# ref: https://huggingface.co/openai-community/gpt2 # ref: https://huggingface.co/openai-community/gpt2
res = "gpt-2" res = "gpt-2"
if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
# ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
res = "stablelm2"
if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
# ref: https://huggingface.co/smallcloudai/Refact-1_6-base # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
res = "refact" res = "refact"
@ -469,6 +471,9 @@ class Model:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6": if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de" res = "jina-v2-de"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if res is None: if res is None:
logger.warning("\n") logger.warning("\n")
@ -489,6 +494,7 @@ class Model:
logger.debug(f"chkhsh: {chkhsh}") logger.debug(f"chkhsh: {chkhsh}")
return res return res
# Marker: End get_vocab_base_pre
def _set_vocab_gpt2(self) -> None: def _set_vocab_gpt2(self) -> None:
tokens, toktypes, tokpre = self.get_vocab_base() tokens, toktypes, tokpre = self.get_vocab_base()
@ -526,7 +532,7 @@ class Model:
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined # for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
added_vocab = tokenizer.special_tokens added_vocab = tokenizer.special_tokens
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()} reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in {**vocab, **added_vocab}.items()}
for i in range(vocab_size): for i in range(vocab_size):
if i not in reverse_vocab: if i not in reverse_vocab:
@ -571,6 +577,10 @@ class Model:
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()): for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id) piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8") text = piece.encode("utf-8")
@ -586,21 +596,23 @@ class Model:
elif tokenizer.IsByte(token_id): elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE toktype = SentencePieceTokenTypes.BYTE
tokens.append(text) tokens[token_id] = text
scores.append(score) scores[token_id] = score
toktypes.append(toktype) toktypes[token_id] = toktype
added_tokens_file = self.dir_model / 'added_tokens.json' added_tokens_file = self.dir_model / 'added_tokens.json'
if added_tokens_file.is_file(): if added_tokens_file.is_file():
with open(added_tokens_file, "r", encoding="utf-8") as f: with open(added_tokens_file, "r", encoding="utf-8") as f:
added_tokens_json = json.load(f) added_tokens_json = json.load(f)
for key in added_tokens_json: for key in added_tokens_json:
key = key.encode("utf-8") token_id = added_tokens_json[key]
if key not in tokens: if (token_id >= vocab_size):
tokens.append(key) logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
scores.append(-1000.0) continue
toktypes.append(SentencePieceTokenTypes.USER_DEFINED)
tokens[token_id] = key.encode("utf-8")
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if vocab_size > len(tokens): if vocab_size > len(tokens):
pad_count = vocab_size - len(tokens) pad_count = vocab_size - len(tokens)
@ -610,8 +622,6 @@ class Model:
scores.append(-1000.0) scores.append(-1000.0)
toktypes.append(SentencePieceTokenTypes.UNUSED) toktypes.append(SentencePieceTokenTypes.UNUSED)
assert len(tokens) == vocab_size
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
@ -622,7 +632,7 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer) special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self): def _set_vocab_llama_hf(self):
vocab = LlamaHfVocab(self.dir_model) vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = [] tokens = []
scores = [] scores = []
toktypes = [] toktypes = []
@ -663,6 +673,44 @@ class GPTNeoXModel(Model):
self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True)) self.gguf_writer.add_parallel_residual(self.hparams.get("use_parallel_residual", True))
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
del bid # unused
n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads"))
n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed"))
tensors: list[tuple[str, Tensor]] = []
if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name):
# Map bloom-style qkv_linear to gpt-style qkv_linear
# bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa
# gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa
qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed))
data_torch = torch.cat(
(
qkv_weights[:, 0, :, :].reshape((-1, n_embed)),
qkv_weights[:, 1, :, :].reshape((-1, n_embed)),
qkv_weights[:, 2, :, :].reshape((-1, n_embed)),
),
dim=0,
)
logger.info("re-format attention.linear_qkv.weight")
elif re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.bias", name):
qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head))
data_torch = torch.cat(
(
qkv_bias[:, 0, :].reshape((n_embed,)),
qkv_bias[:, 1, :].reshape((n_embed,)),
qkv_bias[:, 2, :].reshape((n_embed,)),
),
dim=0,
)
logger.info("re-format attention.linear_qkv.bias")
tensors.append((self.map_tensor_name(name), data_torch))
return tensors
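The modify_tensors hook above splits GPT-NeoX's fused, per-head query_key_value weight into the gpt-style ordering referenced in the comments (all Q rows, then all K rows, then all V rows). A small numpy sketch of the same index manipulation with made-up dimensions (illustration only):

import numpy as np

n_head, n_embed = 2, 6                    # toy sizes; head_dim = n_embed // n_head = 3
qkv = np.arange(3 * n_embed * n_embed).reshape(3 * n_embed, n_embed)  # fused qkv weight, per-head layout

w = qkv.reshape(n_head, 3, n_embed // n_head, n_embed)
reordered = np.concatenate(
    (
        w[:, 0, :, :].reshape(-1, n_embed),  # all query rows
        w[:, 1, :, :].reshape(-1, n_embed),  # then all key rows
        w[:, 2, :, :].reshape(-1, n_embed),  # then all value rows
    ),
    axis=0,
)
assert reordered.shape == qkv.shape          # same size, rows regrouped by Q/K/V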
@Model.register("BloomForCausalLM") @Model.register("BloomForCausalLM")
class BloomModel(Model): class BloomModel(Model):
@ -1139,45 +1187,6 @@ class RefactModel(Model):
return tensors return tensors
@Model.register("PersimmonForCausalLM")
class PersimmonModel(Model):
model_arch = gguf.MODEL_ARCH.PERSIMMON
def set_gguf_parameters(self):
block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers"))
head_count = self.hparams["num_attention_heads"]
head_count_kv = head_count
hidden_size = self.hparams["hidden_size"]
self.gguf_writer.add_name('persimmon-8b-chat')
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hidden_size)
self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
# NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller
# than the head size?
# ref: https://github.com/ggerganov/llama.cpp/pull/4889
# self.gguf_writer.add_rope_dimension_count(hidden_size // head_count)
self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
self.gguf_writer.add_head_count(head_count)
self.gguf_writer.add_head_count_kv(head_count_kv)
self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"])
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"])
def set_vocab(self):
self._set_vocab_sentencepiece()
# self.gguf_writer.add_bos_token_id(71013)
# self.gguf_writer.add_eos_token_id(71013)
def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
del name, new_name, bid, n_dims # unused
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
return True
@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM")
class StableLMModel(Model): class StableLMModel(Model):
model_arch = gguf.MODEL_ARCH.STABLELM model_arch = gguf.MODEL_ARCH.STABLELM
@ -1306,6 +1315,17 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
# Apply to granite small models only
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)
@staticmethod @staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv: if n_head_kv is not None and n_head != n_head_kv:
@ -1320,9 +1340,9 @@ class LlamaModel(Model):
n_head = self.hparams["num_attention_heads"] n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads") n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"): if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head) data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"): if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately # process the experts separately
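The q/k permutation above converts Hugging Face's rotary layout into the interleaved layout llama.cpp expects, and this change extends it from the projection weights to the biases as well. A minimal sketch of that permutation, assuming the usual reshape/swapaxes implementation (a 1-D bias permutes the same way because its trailing shape is empty):

```python
from __future__ import annotations
import torch

def permute(weights: torch.Tensor, n_head: int, n_head_kv: int | None) -> torch.Tensor:
    # collapse to the KV head count when grouped-query attention is used
    if n_head_kv is not None and n_head != n_head_kv:
        n_head = n_head_kv
    # split each head into (2, head_dim // 2), swap those two axes,
    # then flatten back to the original shape
    return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# tiny made-up example: n_embd = 8, n_head = 2, applied to a bias vector
q_bias = torch.arange(8, dtype=torch.float32)
print(permute(q_bias, 2, None))   # tensor([0., 2., 1., 3., 4., 6., 5., 7.])
```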
@ -1770,6 +1790,38 @@ class Phi3MiniModel(Model):
scores[token_id] = -1000.0 scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
for token_id, foken_data in added_tokens_decoder.items():
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
tokenizer_file = self.dir_model / 'tokenizer.json'
if tokenizer_file.is_file():
with open(tokenizer_file, "r", encoding="utf-8") as f:
tokenizer_json = json.load(f)
added_tokens = tokenizer_json.get("added_tokens", [])
for foken_data in added_tokens:
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
if foken_data.get("special"):
toktypes[token_id] = SentencePieceTokenTypes.CONTROL
self.gguf_writer.add_tokenizer_model("llama") self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_list(tokens)
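The two loops above expect `added_tokens_decoder` (from tokenizer_config.json) and `added_tokens` (from tokenizer.json) entries that map token ids to `content` and `special` fields. A hypothetical fragment of that shape, with made-up ids and strings, showing how the `special` flag decides between USER_DEFINED and CONTROL:

```python
# Illustrative only: a tokenizer_config.json fragment of the shape consumed above.
tokenizer_config_json = {
    "added_tokens_decoder": {
        "32000": {"content": "<|endoftext|>", "special": True},   # -> CONTROL
        "32010": {"content": "<|user|>",      "special": True},   # -> CONTROL
        "32011": {"content": "<step>",        "special": False},  # -> USER_DEFINED
    }
}

for token_id, token_data in tokenizer_config_json["added_tokens_decoder"].items():
    kind = "CONTROL" if token_data.get("special") else "USER_DEFINED"
    print(int(token_id), token_data["content"].encode("utf-8"), kind)
```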
@ -1782,23 +1834,59 @@ class Phi3MiniModel(Model):
def set_gguf_parameters(self): def set_gguf_parameters(self):
block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) block_count = self.find_hparam(["num_hidden_layers", "n_layer"])
rot_pct = 1.0
n_embd = self.find_hparam(["hidden_size", "n_embd"]) n_embd = self.find_hparam(["hidden_size", "n_embd"])
n_head = self.find_hparam(["num_attention_heads", "n_head"]) n_head = self.find_hparam(["num_attention_heads", "n_head"])
n_head_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"])
rms_eps = self.find_hparam(["rms_norm_eps"]) rms_eps = self.find_hparam(["rms_norm_eps"])
max_pos_embds = self.find_hparam(["n_positions", "max_position_embeddings"])
orig_max_pos_embds = self.find_hparam(["original_max_position_embeddings"])
rope_dims = n_embd // n_head
self.gguf_writer.add_name("Phi3") self.gguf_writer.add_name("Phi3")
self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) self.gguf_writer.add_context_length(max_pos_embds)
self.gguf_writer.add_rope_scaling_orig_ctx_len(orig_max_pos_embds)
self.gguf_writer.add_embedding_length(n_embd) self.gguf_writer.add_embedding_length(n_embd)
self.gguf_writer.add_feed_forward_length(8192) self.gguf_writer.add_feed_forward_length(self.find_hparam(["intermediate_size"]))
self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_block_count(block_count)
self.gguf_writer.add_head_count(n_head) self.gguf_writer.add_head_count(n_head)
self.gguf_writer.add_head_count_kv(n_head) self.gguf_writer.add_head_count_kv(n_head_kv)
self.gguf_writer.add_layer_norm_rms_eps(rms_eps) self.gguf_writer.add_layer_norm_rms_eps(rms_eps)
self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) self.gguf_writer.add_rope_dimension_count(rope_dims)
self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"]))
self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_file_type(self.ftype)
# write rope scaling for long context (128k) model
rope_scaling = self.find_hparam(['rope_scaling'], True)
if (rope_scaling is None):
return
scale = max_pos_embds / orig_max_pos_embds
rope_scaling_type = rope_scaling.get('type', '').lower()
if len(rope_scaling_type) == 0:
raise KeyError('Missing the required key rope_scaling.type')
if rope_scaling_type == 'su':
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos_embds)) if scale > 1.0 else 1.0
elif rope_scaling_type == 'yarn':
attn_factor = 0.1 * math.log(scale) + 1.0 if scale > 1.0 else 1.0
else:
raise NotImplementedError(f'The rope scaling type {rope_scaling_type} is not supported yet')
self.gguf_writer.add_rope_scaling_attn_factors(attn_factor)
long_factors = rope_scaling.get('long_factor', None)
short_factors = rope_scaling.get('short_factor', None)
if long_factors is None or short_factors is None:
raise KeyError('Missing the required key rope_scaling.long_factor or rope_scaling.short_factor')
if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2:
raise ValueError(f'The length of rope long and short factors must be {rope_dims / 2}')
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_LONG] + ".weight", np.array(long_factors, dtype=np.float32))
self.gguf_writer.add_tensor(gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT] + ".weight", np.array(short_factors, dtype=np.float32))
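A worked sketch of the long-context attention factor written above, using the same 'su'/'yarn' formulas; the context lengths are illustrative only:

```python
import math

def rope_attn_factor(rope_type: str, max_pos: int, orig_max_pos: int) -> float:
    # scale > 1 means the model is being stretched past its original context length
    scale = max_pos / orig_max_pos
    if scale <= 1.0:
        return 1.0
    if rope_type == "su":
        return math.sqrt(1 + math.log(scale) / math.log(orig_max_pos))
    if rope_type == "yarn":
        return 0.1 * math.log(scale) + 1.0
    raise NotImplementedError(f"The rope scaling type {rope_type} is not supported yet")

# e.g. a 4k model extended to 128k with 'su' scaling (Phi-3-style long context)
print(rope_attn_factor("su", 131072, 4096))
```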
@Model.register("PlamoForCausalLM") @Model.register("PlamoForCausalLM")
class PlamoModel(Model): class PlamoModel(Model):
@ -2316,7 +2404,8 @@ class CommandR2Model(Model):
# max_position_embeddings = 8192 in config.json but model was actually # max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length # trained on 128k context length
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"] # aya-23 models don't have model_max_length specified
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
def set_gguf_parameters(self): def set_gguf_parameters(self):
super().set_gguf_parameters() super().set_gguf_parameters()
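The `find_hparam` fallback used here (and throughout this converter) is assumed to return the value of the first key present in the hparams, optionally tolerating a complete miss. A minimal sketch of that behaviour with a hypothetical aya-23 style config that omits model_max_length:

```python
def find_hparam(hparams: dict, keys: list[str], optional: bool = False):
    # return the first key that is actually present in the config
    for key in keys:
        if key in hparams:
            return hparams[key]
    if optional:
        return None
    raise KeyError(f"could not find any of: {keys}")

hparams = {"max_position_embeddings": 8192}  # made-up config without model_max_length
print(find_hparam(hparams, ["model_max_length", "max_position_embeddings"]))  # 8192
```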
@ -2389,6 +2478,236 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True) self.gguf_writer.add_add_eos_token(True)
@Model.register("ArcticForCausalLM")
class ArcticModel(Model):
model_arch = gguf.MODEL_ARCH.ARCTIC
def set_vocab(self):
# The reason for using a custom implementation here is that the
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
if not tokenizer_path.is_file():
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)
# Read the whole vocabulary from the tokenizer.model file
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
# Use the added_tokens_decoder field from tokenizer_config.json as the source
# of information about added/redefined tokens and modify them accordingly.
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "added_tokens_decoder" in tokenizer_config_json:
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
if (token_id >= vocab_size):
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
token_content = token_json["content"]
token_type = SentencePieceTokenTypes.USER_DEFINED
token_score = -10000.0
# Map unk_token to UNKNOWN, other special tokens to CONTROL
# Set the score to 0.0 as in the original tokenizer.model
if ("special" in token_json) and token_json["special"]:
if token_content == tokenizer_config_json["unk_token"]:
token_type = SentencePieceTokenTypes.UNKNOWN
else:
token_type = SentencePieceTokenTypes.CONTROL
token_score = 0.0
logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
tokens[token_id] = token_content.encode("utf-8")
toktypes[token_id] = token_type
scores[token_id] = token_score
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for wid in ["w1", "w2", "w3"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def write_tensors(self):
super().write_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekV2ForCausalLM")
class DeepseekV2Model(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def write_tensors(self):
super().write_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
###### CONVERSION LOGIC ###### ###### CONVERSION LOGIC ######
@ -2521,7 +2840,12 @@ def main() -> None:
hparams = Model.load_hparams(dir_model) hparams = Model.load_hparams(dir_model)
with torch.inference_mode(): with torch.inference_mode():
model_class = Model.from_model_architecture(hparams["architectures"][0]) try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
logger.info("Set model parameters") logger.info("Set model parameters")

View file

@ -1,143 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import sys
from pathlib import Path
from pprint import pprint
import torch
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
logger = logging.getLogger("persimmon-to-gguf")
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
for key in dct.keys():
new_prefix = prefix + '.' + key if prefix is not None else key
if isinstance(dct[key], torch.Tensor):
tensors[new_prefix] = dct[key]
elif isinstance(dct[key], dict):
_flatten_dict(dct[key], tensors, new_prefix)
else:
raise ValueError(type(dct[key]))
return None
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
logger.info('adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
for i in range(tokenizer.vocab_size()):
text: bytes
score: float
piece = tokenizer.id_to_piece(i)
text = piece.encode("utf-8")
score = tokenizer.get_score(i)
toktype = 1
if tokenizer.is_unknown(i):
toktype = 2
if tokenizer.is_control(i):
toktype = 3
if tokenizer.is_unused(i):
toktype = 5
if tokenizer.is_byte(i):
toktype = 6
tokens.append(text)
scores.append(score)
toktypes.append(toktype)
pass
return tokens, scores, toktypes
def main():
parser = argparse.ArgumentParser(description="Convert a Persimmon model from Adept (e.g. Persimmon 8b chat) to a GGML compatible file")
parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input")
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
pprint(hparams)
tensors: dict[str, torch.Tensor] = {}
_flatten_dict(persimmon_model['model'], tensors, None)
arch = gguf.MODEL_ARCH.PERSIMMON
gguf_writer = gguf.GGUFWriter(args.outfile, gguf.MODEL_ARCH_NAMES[arch])
block_count = hparams.num_layers
head_count = hparams.num_attention_heads
head_count_kv = head_count
ctx_length = hparams.seq_length
hidden_size = hparams.hidden_size
gguf_writer.add_name('persimmon-8b-chat')
gguf_writer.add_context_length(ctx_length)
gguf_writer.add_embedding_length(hidden_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(hparams.ffn_hidden_size)
# ref: https://github.com/ggerganov/llama.cpp/pull/4889/commits/eea19039fc52ea2dbd1aab45b59ab4e3e29a3443
gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2)
gguf_writer.add_head_count(head_count)
gguf_writer.add_head_count_kv(head_count_kv)
gguf_writer.add_rope_freq_base(hparams.rotary_emb_base)
gguf_writer.add_layer_norm_eps(hparams.layernorm_epsilon)
tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir)
gguf_writer.add_tokenizer_model('llama')
gguf_writer.add_tokenizer_pre('default')
gguf_writer.add_token_list(tokens)
gguf_writer.add_token_scores(scores)
gguf_writer.add_token_types(toktypes)
gguf_writer.add_bos_token_id(71013)
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
logger.info(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
continue
old_dtype = data_torch.dtype
# TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
raise ValueError(f"Can not map tensor '{name}'")
n_dims = len(data.shape)
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
gguf_writer.add_tensor(new_name, data)
logger.info("gguf: write header")
gguf_writer.write_header_to_file()
logger.info("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
logger.info("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
if __name__ == '__main__':
main()

View file

@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF ### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py). Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.
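As a rough illustration of what such a convert script does with the gguf package, here is a minimal sketch; the calls mirror the writer API used by the converters in this repository, and the metadata and tensor values are placeholders:

```python
import numpy as np
import gguf

# Illustrative only: write a tiny GGUF file with a couple of metadata keys and one tensor.
writer = gguf.GGUFWriter("example.gguf", "llama")    # output path + architecture name
writer.add_name("example-model")
writer.add_block_count(2)
writer.add_tensor("output_norm.weight", np.ones(16, dtype=np.float32))

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()
writer.close()
```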

View file

@ -1,6 +1,6 @@
# Debugging Tests Tips # Debugging Tests Tips
## How to run & debug a specific test without anything else to keep the feedback loop short? ## How to run & execute or debug a specific test without anything else to keep the feedback loop short?
There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number. There is a script called debug-test.sh in the scripts folder whose parameter takes a REGEX and an optional test number.
@ -10,13 +10,27 @@ For example, running the following command will output an interactive list from
It will then build & run in the debugger for you. It will then build & run in the debugger for you.
To just execute a test and get back a PASS or FAIL message run:
```bash ```bash
./scripts/debug-test.sh test-tokenizer ./scripts/debug-test.sh test-tokenizer
```
To test in GDB use the `-g` flag to enable gdb test mode.
```bash
./scripts/debug-test.sh -g test-tokenizer
# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows: # Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main >>> b main
``` ```
To speed up the testing loop, if you know your test number you can run it directly, as shown below:
```bash
./scripts/debug-test.sh test 23
```
For further reference use `debug-test.sh -h` to print help. For further reference use `debug-test.sh -h` to print help.
&nbsp; &nbsp;
@ -41,7 +55,7 @@ cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j make -j
``` ```
#### Step 3.1: Identify Test Command for Debugging #### Step 3: Find all tests available that match the REGEX
The output of this command will give you the command & arguments needed to run GDB. The output of this command will give you the command & arguments needed to run GDB.
@ -69,11 +83,13 @@ Labels: main
... ...
``` ```
So for test #1 we can tell these two pieces of relevant information: #### Step 4: Identify Test Command for Debugging
So for test #1 above we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0` * Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf` * Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`
#### Step 3.2: Run GDB on test command #### Step 5: Run GDB on test command
Based on the ctest 'test command' report above we can then run a gdb session via this command below: Based on the ctest 'test command' report above we can then run a gdb session via this command below:

View file

@ -48,7 +48,7 @@ int main(int argc, char ** argv) {
params.prompt = "Hello my name is"; params.prompt = "Hello my name is";
} }
process_escapes(params.prompt); string_process_escapes(params.prompt);
// init LLM // init LLM

View file

@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
import numpy as np import numpy as np
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ: if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) # use .parent.parent since we are in "examples" directory
sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
import gguf import gguf
from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
if TYPE_CHECKING: if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias from typing_extensions import Self, TypeAlias
@ -380,306 +382,6 @@ class Metadata:
return metadata return metadata
#
# vocab
#
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
for i, _ in enumerate(self.vocab):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)
# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids = set(self.tokenizer.all_special_ids)
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}
for token_id in range(self.vocab_size_base):
# Skip processing added tokens here
if token_id in self.added_tokens_ids:
continue
# Convert token text to bytes
token_text = reverse_vocab[token_id].encode("utf-8")
# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
# This needs to be implemented based on specific requirements
return -1000.0 # Default score
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
yield text.encode("utf-8"), score, toktype
def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
# #
# data loading # data loading
# TODO: reuse (probably move to gguf.py?) # TODO: reuse (probably move to gguf.py?)
@ -1109,7 +811,7 @@ class OutputFile:
if metadata is not None and metadata.name is not None: if metadata is not None and metadata.name is not None:
name = metadata.name name = metadata.name
elif params.path_model is not None: elif params.path_model is not None:
name = str(params.path_model.parent).split("/")[-1] name = params.path_model.name
elif params.n_ctx == 4096: elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model # Heuristic detection of LLaMA v2 model
name = "LLaMA v2" name = "LLaMA v2"

View file

@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
params.samples_start_after_nl = false; params.samples_start_after_nl = false;
params.use_adam = true; params.use_adam = true;
params.use_flash = true; params.use_flash = false;
params.use_scratch = true; params.use_scratch = true;
// only adam // only adam

View file

@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -107,7 +107,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// split the prompt into lines // split the prompt into lines
@ -211,6 +211,7 @@ int main(int argc, char ** argv) {
// clean up // clean up
llama_print_timings(ctx); llama_print_timings(ctx);
llama_batch_free(batch);
llama_free(ctx); llama_free(ctx);
llama_free_model(model); llama_free_model(model);
llama_backend_free(); llama_backend_free();

View file

@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = run(ctx, params); bool OK = run(ctx, params);

View file

@ -563,8 +563,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
// not capturing these, to silence warnings // not capturing these, to silence warnings
const int rope_mode = 0; const int rope_mode = 0;
return ggml_rope_custom(ctx, return ggml_rope_ext(ctx,
t, KQ_pos, n_rot, rope_mode, n_ctx, 0, t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
); );
}; };
@ -643,7 +643,8 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch); struct ggml_tensor * t15 = ggml_permute (ctx, t12, 0, 3, 1, 2); set_name(t15, "t15"); assert_shape_4d(t15, N, n_embd_head, n_head_kv, n_batch);
struct ggml_tensor * t16; struct ggml_tensor * t16;
if (enable_flash_attn) { if (enable_flash_attn) {
t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch); GGML_ASSERT(false && "TODO: ggml_flash_attn_ext() not yet supported");
//t16 = ggml_flash_attn(ctx, t13, t14, t15, true); set_name(t16, "t16"); assert_shape_4d(t16, n_embd_head, N, n_head, n_batch);
} else { } else {
struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch); struct ggml_tensor * t16_0 = ggml_mul_mat (ctx, t14, t13); set_name(t16_0, "t16_0"); assert_shape_4d(t16_0, N, N, n_head, n_batch);
struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch); struct ggml_tensor * t16_1 = ggml_scale_inplace (ctx, t16_0, kv_scale); set_name(t16_1, "t16_1"); assert_shape_4d(t16_1, N, N, n_head, n_batch);

View file

@ -598,7 +598,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
sparams.dataset = params.prompt_file; sparams.dataset = params.prompt_file;
@ -667,7 +667,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk); bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);

View file

@ -50,9 +50,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -70,7 +70,7 @@ static void write_logfile(
fprintf(logfile, "binary: infill\n"); fprintf(logfile, "binary: infill\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -78,8 +78,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -236,7 +236,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
const bool add_bos = llama_should_add_bos_token(model); const bool add_bos = llama_should_add_bos_token(model);
GGML_ASSERT(llama_add_eos_token(model) != 1); GGML_ASSERT(llama_add_eos_token(model) != 1);
@ -621,8 +621,8 @@ int main(int argc, char ** argv) {
if (params.escape) { if (params.escape) {
//process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here
process_escapes(params.input_prefix); string_process_escapes(params.input_prefix);
process_escapes(params.input_suffix); string_process_escapes(params.input_suffix);
} }
suff_rm_leading_spc = params.escape; suff_rm_leading_spc = params.escape;
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {

View file

@ -178,6 +178,7 @@ struct cmd_params {
std::vector<ggml_type> type_v; std::vector<ggml_type> type_v;
std::vector<int> n_threads; std::vector<int> n_threads;
std::vector<int> n_gpu_layers; std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode; std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu; std::vector<int> main_gpu;
std::vector<bool> no_kv_offload; std::vector<bool> no_kv_offload;
@ -195,13 +196,14 @@ static const cmd_params cmd_params_defaults = {
/* model */ {"models/7B/ggml-model-q4_0.gguf"}, /* model */ {"models/7B/ggml-model-q4_0.gguf"},
/* n_prompt */ {512}, /* n_prompt */ {512},
/* n_gen */ {128}, /* n_gen */ {128},
/* n_pg */ {{512, 128}}, /* n_pg */ {},
/* n_batch */ {2048}, /* n_batch */ {2048},
/* n_ubatch */ {512}, /* n_ubatch */ {512},
/* type_k */ {GGML_TYPE_F16}, /* type_k */ {GGML_TYPE_F16},
/* type_v */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16},
/* n_threads */ {get_math_cpu_count()}, /* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99}, /* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0}, /* main_gpu */ {0},
/* no_kv_offload */ {false}, /* no_kv_offload */ {false},
@ -230,6 +232,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@ -384,6 +387,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
} }
auto p = split<int>(argv[i], split_delim); auto p = split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
} else if (arg == "-sm" || arg == "--split-mode") { } else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
@ -519,6 +528,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@ -541,6 +551,7 @@ struct cmd_params_instance {
ggml_type type_v; ggml_type type_v;
int n_threads; int n_threads;
int n_gpu_layers; int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode; llama_split_mode split_mode;
int main_gpu; int main_gpu;
bool no_kv_offload; bool no_kv_offload;
@ -553,6 +564,9 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params(); llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers; mparams.n_gpu_layers = n_gpu_layers;
if (!rpc_servers.empty()) {
mparams.rpc_servers = rpc_servers.c_str();
}
mparams.split_mode = split_mode; mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu; mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data(); mparams.tensor_split = tensor_split.data();
@ -564,6 +578,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const { bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && return model == other.model &&
n_gpu_layers == other.n_gpu_layers && n_gpu_layers == other.n_gpu_layers &&
rpc_servers == other.rpc_servers &&
split_mode == other.split_mode && split_mode == other.split_mode &&
main_gpu == other.main_gpu && main_gpu == other.main_gpu &&
use_mmap == other.use_mmap && use_mmap == other.use_mmap &&
@ -592,6 +607,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded // this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model) for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers) for (const auto & nl : params.n_gpu_layers)
for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode) for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu) for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split) for (const auto & ts : params.tensor_split)
@ -618,6 +634,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
@ -643,6 +660,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
@ -668,6 +686,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv, /* .type_v = */ tv,
/* .n_threads = */ nt, /* .n_threads = */ nt,
/* .n_gpu_layers = */ nl, /* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm, /* .split_mode = */ sm,
/* .main_gpu = */ mg, /* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo, /* .no_kv_offload= */ nkvo,
@ -692,6 +711,7 @@ struct test {
static const bool kompute; static const bool kompute;
static const bool metal; static const bool metal;
static const bool sycl; static const bool sycl;
static const bool rpc;
static const bool gpu_blas; static const bool gpu_blas;
static const bool blas; static const bool blas;
static const std::string cpu_info; static const std::string cpu_info;
@ -790,6 +810,9 @@ struct test {
if (sycl) { if (sycl) {
return GGML_SYCL_NAME; return GGML_SYCL_NAME;
} }
if (rpc) {
return "RPC";
}
if (gpu_blas) { if (gpu_blas) {
return "GPU BLAS"; return "GPU BLAS";
} }
@ -803,7 +826,7 @@ struct test {
static const std::vector<std::string> & get_fields() { static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = { static const std::vector<std::string> fields = {
"build_commit", "build_number", "build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info", "cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params", "model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch", "n_batch", "n_ubatch",
@ -859,7 +882,7 @@ struct test {
std::vector<std::string> values = { std::vector<std::string> values = {
build_commit, std::to_string(build_number), build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute), std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(kompute),
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas), std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info, cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_batch), std::to_string(n_ubatch),
@ -894,6 +917,7 @@ const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas(); const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas(); const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl(); const bool test::sycl = !!ggml_cpu_has_sycl();
const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info(); const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info(); const std::string test::gpu_info = get_gpu_info();
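`get_cmd_params_instances` expands every comma-separated CLI list, now including `--rpc`, into the cross product of benchmark runs (ordered so that each model is reloaded as rarely as possible). A rough Python analogy of that expansion, not the actual C++:

```python
from itertools import product

# Rough analogy (not the actual C++): each CLI list becomes one axis of the sweep.
models       = ["models/7B/ggml-model-q4_0.gguf"]
n_gpu_layers = [99]
rpc_servers  = [""]            # new --rpc axis; an empty string means "no RPC backend"
n_prompt     = [512]
n_gen        = [128]

for combo in product(models, n_gpu_layers, rpc_servers, n_prompt, n_gen):
    print(combo)
```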

View file

@ -7,8 +7,6 @@ android {
namespace = "com.example.llama" namespace = "com.example.llama"
compileSdk = 34 compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig { defaultConfig {
applicationId = "com.example.llama" applicationId = "com.example.llama"
minSdk = 33 minSdk = 33
@ -20,17 +18,6 @@ android {
vectorDrawables { vectorDrawables {
useSupportLibrary = true useSupportLibrary = true
} }
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
}
}
} }
buildTypes { buildTypes {
@ -55,17 +42,6 @@ android {
composeOptions { composeOptions {
kotlinCompilerExtensionVersion = "1.5.1" kotlinCompilerExtensionVersion = "1.5.1"
} }
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
} }
dependencies { dependencies {
@ -78,6 +54,7 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics") implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview") implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3") implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2") testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5") androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1") androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

View file

@ -1,5 +1,6 @@
package com.example.llama package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log import android.util.Log
import androidx.compose.runtime.getValue import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf import androidx.compose.runtime.mutableStateOf
@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch import kotlinx.coroutines.launch
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() { class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object { companion object {
@JvmStatic @JvmStatic
private val NanosPerSecond = 1_000_000_000.0 private val NanosPerSecond = 1_000_000_000.0
@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch { viewModelScope.launch {
try { try {
llm.unload() llamaAndroid.unload()
} catch (exc: IllegalStateException) { } catch (exc: IllegalStateException) {
messages += exc.message!! messages += exc.message!!
} }
@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
messages += "" messages += ""
viewModelScope.launch { viewModelScope.launch {
llm.send(text) llamaAndroid.send(text)
.catch { .catch {
Log.e(tag, "send() failed", it) Log.e(tag, "send() failed", it)
messages += it.message!! messages += it.message!!
@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch { viewModelScope.launch {
try { try {
val start = System.nanoTime() val start = System.nanoTime()
val warmupResult = llm.bench(pp, tg, pl, nr) val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime() val end = System.nanoTime()
messages += warmupResult messages += warmupResult
@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
return@launch return@launch
} }
messages += llm.bench(512, 128, 1, 3) messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) { } catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc) Log.e(tag, "bench() failed", exc)
messages += exc.message!! messages += exc.message!!
@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
fun load(pathToModel: String) { fun load(pathToModel: String) {
viewModelScope.launch { viewModelScope.launch {
try { try {
llm.load(pathToModel) llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel" messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) { } catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc) Log.e(tag, "load() failed", exc)

View file

@ -2,4 +2,5 @@
plugins { plugins {
id("com.android.application") version "8.2.0" apply false id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
} }

View file

@ -0,0 +1 @@
/build

View file

@ -0,0 +1,55 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
## Fetch latest llama.cpp from GitHub
#include(FetchContent)
#FetchContent_Declare(
# llama
# GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
# GIT_TAG master
#)
#
## Also provides "common"
#FetchContent_MakeAvailable(llama)
# llama.cpp CI uses the code from the current branch
# ref: https://github.com/ggerganov/llama.cpp/pull/7341#issuecomment-2117617700
add_subdirectory(../../../../../../ build-llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View file

@ -0,0 +1,68 @@
plugins {
id("com.android.library")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "android.llama.cpp"
compileSdk = 34
defaultConfig {
minSdk = 33
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
consumerProguardFiles("consumer-rules.pro")
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
cppFlags("")
}
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
externalNativeBuild {
cmake {
path("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.appcompat:appcompat:1.6.1")
implementation("com.google.android.material:material:1.11.0")
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}

View file

@ -0,0 +1,21 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View file

@ -0,0 +1,24 @@
package android.llama.cpp
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.Assert.*
/**
* Instrumented test, which will execute on an Android device.
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
@Test
fun useAppContext() {
// Context of the app under test.
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
assertEquals("android.llama.cpp.test", appContext.packageName)
}
}

View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View file

@ -1,4 +1,3 @@
# For more information about using CMake with Android Studio, read the # For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html. # documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples. # For more examples on how to use CMake, see https://github.com/android/ndk-samples.
@ -36,15 +35,15 @@ FetchContent_MakeAvailable(llama)
# for GameActivity/NativeActivity derived applications, the same library name must be # for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file. # used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt. # List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp) llama-android.cpp)
# Specifies libraries CMake should link to your target library. You # Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this # can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries. # build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME} target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library # List libraries link to the target library
llama llama
common common
android android
log) log)

View file

@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C" extern "C"
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) { Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0); auto path_to_model = env->GetStringUTFChars(filename, 0);
@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) { Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model)); llama_free_model(reinterpret_cast<llama_model *>(model));
} }
extern "C" extern "C"
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) { Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel); auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) { if (!model) {
@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) { Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context)); llama_free(reinterpret_cast<llama_context *>(context));
} }
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) { Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
llama_backend_free(); llama_backend_free();
} }
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) { Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL); llama_log_set(log_callback, NULL);
} }
extern "C" extern "C"
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_bench_1model( Java_android_llama_cpp_LLamaAndroid_bench_1model(
JNIEnv *env, JNIEnv *env,
jobject, jobject,
jlong context_pointer, jlong context_pointer,
@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) { Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer)); llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
} }
extern "C" extern "C"
JNIEXPORT jlong JNICALL JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) { Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated. // Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) { Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
llama_backend_init(); llama_backend_init();
} }
extern "C" extern "C"
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) { Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info()); return env->NewStringUTF(llama_print_system_info());
} }
extern "C" extern "C"
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_com_example_llama_Llm_completion_1init( Java_android_llama_cpp_LLamaAndroid_completion_1init(
JNIEnv *env, JNIEnv *env,
jobject, jobject,
jlong context_pointer, jlong context_pointer,
@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
extern "C" extern "C"
JNIEXPORT jstring JNICALL JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_completion_1loop( Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env, JNIEnv * env,
jobject, jobject,
jlong context_pointer, jlong context_pointer,
@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
extern "C" extern "C"
JNIEXPORT void JNICALL JNIEXPORT void JNICALL
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) { Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context)); llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
} }

View file

@ -1,4 +1,4 @@
package com.example.llama package android.llama.cpp
import android.util.Log import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher import kotlinx.coroutines.CoroutineDispatcher
@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors import java.util.concurrent.Executors
import kotlin.concurrent.thread import kotlin.concurrent.thread
class Llm { class LLamaAndroid {
private val tag: String? = this::class.simpleName private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle } private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@ -165,8 +165,8 @@ class Llm {
} }
// Enforce only one instance of Llm. // Enforce only one instance of Llm.
private val _instance: Llm = Llm() private val _instance: LLamaAndroid = LLamaAndroid()
fun instance(): Llm = _instance fun instance(): LLamaAndroid = _instance
} }
} }

View file

@ -0,0 +1,17 @@
package android.llama.cpp
import org.junit.Test
import org.junit.Assert.*
/**
* Example local unit test, which will execute on the development machine (host).
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
class ExampleUnitTest {
@Test
fun addition_isCorrect() {
assertEquals(4, 2 + 2)
}
}

View file

@ -15,3 +15,4 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid" rootProject.name = "LlamaAndroid"
include(":app") include(":app")
include(":llama")

View file

@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
--projector-type ldpv2 --projector-type ldpv2
``` ```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: 4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./convert.py path/to/MobileVLM-1.7B python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
``` ```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k` 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
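As a sketch of step 5 (the paths below are placeholders, not taken from this diff), the call follows the usual `quantize <input> <output> <type>` pattern:
```sh
# hypothetical paths; point at the GGUF produced in step 4
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k
```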

View file

@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
``` ```
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: 5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh ```sh
python ./convert.py ../llava-v1.5-7b --skip-unknown python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
``` ```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
6) Then convert the model to gguf format: 6) Then convert the model to gguf format:
```console ```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
``` ```
7) And finally we can run the llava-cli using the 1.6 model version: 7) And finally we can run the llava-cli using the 1.6 model version:
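For illustration only, an invocation could look like the sketch below; the model, projector, and image paths are placeholders:
```console
# hypothetical paths; -m, --mmproj and --image point at the files produced in the previous steps
./llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096
```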

View file

@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */ /** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img); CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */ /** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs ); CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx); CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

View file

@ -290,7 +290,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS #endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_print_usage(argc, argv, params); gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv); show_additional_info(argc, argv);
return 1; return 1;
} }

View file

@ -88,7 +88,6 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
// Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
struct { struct {
struct ggml_tensor * newline;
struct ggml_context * ctx; struct ggml_context * ctx;
} model; } model;
@ -150,20 +149,6 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
ggml_tensor * newline_tmp = clip_get_newline_tensor(ctx_clip);
model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]);
if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) {
if (newline_tmp->buffer == NULL) {
LOG_TEE("newline_tmp tensor buffer is NULL\n");
}
ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp));
} else {
model.newline->data = newline_tmp->data;
if (model.newline->data == NULL) {
LOG_TEE("newline_tmp tensor data is NULL\n");
}
}
struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
// ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
// fill it with the image embeddings, ignoring the base // fill it with the image embeddings, ignoring the base

View file

@ -1,3 +1,3 @@
-r ../../requirements/requirements-convert.txt -r ../../requirements/requirements-convert-legacy-llama.txt
pillow~=10.2.0 pillow~=10.2.0
torch~=2.1.1 torch~=2.1.1

View file

@ -174,7 +174,7 @@ int main(int argc, char ** argv) {
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/ // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/

View file

@ -121,7 +121,7 @@ int main(int argc, char ** argv){
// debug // debug
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
// print current draft sequence // print current draft sequence

View file

@ -325,3 +325,5 @@ These options provide extra functionality and customization when running the LLa
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-hfr URL --hf-repo URL`: The url to the Hugging Face model repository. Used in conjunction with `--hf-file` or `-hff`. The model is downloaded and stored in the file provided by `-m` or `--model`. If `-m` is not provided, the model is auto-stored in the path specified by the `LLAMA_CACHE` environment variable or in an OS-specific local cache.
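As a sketch of the `-hfr`/`-hff` pair described above (the repository and file names are placeholders, not real models):
```sh
# hypothetical repo and file; the GGUF is downloaded, cached, and then used as the model
./main -hfr some-org/some-model-GGUF -hff some-model.Q4_K_M.gguf -p "Hello"
```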

View file

@ -60,9 +60,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -80,7 +80,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, input_tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -88,8 +88,8 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_string_yaml_multiline(logfile, "output", output.c_str()); yaml_dump_string_multiline(logfile, "output", output.c_str());
dump_vector_int_yaml(logfile, "output_tokens", output_tokens); yaml_dump_vector_int(logfile, "output_tokens", output_tokens);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -181,7 +181,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
LOG("%s: llama backend init\n", __func__); LOG("%s: llama backend init\n", __func__);
@ -219,7 +219,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
LOG_TEE("\n"); LOG_TEE("\n");
LOG_TEE("%s\n", get_system_info(params).c_str()); LOG_TEE("%s\n", gpt_params_get_system_info(params).c_str());
} }
std::string path_session = params.path_prompt_cache; std::string path_session = params.path_prompt_cache;
@ -474,12 +474,12 @@ int main(int argc, char ** argv) {
LOG_TEE("\n\n"); LOG_TEE("\n\n");
if (params.interactive) { if (params.interactive) {
const char *control_message; const char * control_message;
if (params.multiline_input) { if (params.multiline_input) {
control_message = " - To return control to LLaMa, end your input with '\\'.\n" control_message = " - To return control to the AI, end your input with '\\'.\n"
" - To return control without starting a new line, end your input with '/'.\n"; " - To return control without starting a new line, end your input with '/'.\n";
} else { } else {
control_message = " - Press Return to return control to LLaMa.\n" control_message = " - Press Return to return control to the AI.\n"
" - To return control without starting a new line, end your input with '/'.\n" " - To return control without starting a new line, end your input with '/'.\n"
" - If you want to submit another line, end your input with '\\'.\n"; " - If you want to submit another line, end your input with '\\'.\n";
} }
@ -707,7 +707,7 @@ int main(int argc, char ** argv) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
llama_sampling_accept(ctx_sampling, ctx, id, true); llama_sampling_accept(ctx_sampling, ctx, id, /* apply_grammar= */ true);
LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str());
@ -728,7 +728,7 @@ int main(int argc, char ** argv) {
// push the prompt in the sampling context in order to apply repetition penalties later // push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules // for the prompt, we don't apply grammar rules
llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], /* apply_grammar= */ false);
++n_consumed; ++n_consumed;
if ((int) embd.size() >= params.n_batch) { if ((int) embd.size() >= params.n_batch) {
@ -740,18 +740,26 @@ int main(int argc, char ** argv) {
// display text // display text
if (input_echo && display) { if (input_echo && display) {
for (auto id : embd) { for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation); const std::string token_str = llama_token_to_piece(ctx, id, params.special);
printf("%s", token_str.c_str());
// Console/Stream Output
fprintf(stdout, "%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) { if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id); input_tokens.push_back(id);
} else { } else {
// Outgoing Generated Tokens
output_tokens.push_back(id); output_tokens.push_back(id);
output_ss << token_str; output_ss << token_str;
} }
fflush(stdout);
} }
fflush(stdout);
} }
// reset color to default if there is no pending user input // reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) { if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset); console::set_display(console::reset);
@ -879,7 +887,7 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
} }
if (params.escape) { if (params.escape) {
process_escapes(buffer); string_process_escapes(buffer);
} }
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);

View file

@ -1,98 +0,0 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
Usage:
python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments:
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
"""
import subprocess
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
import argparse
import os
from huggingface_hub import snapshot_download
def main(model, model_type, outname, outdir, quants, keep_fp16):
if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...")
try:
if outname is None:
outname = model.split('/')[-1]
model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
except Exception as e:
raise Exception(f"Could not download the model: {e}")
if outdir is None:
outdir = f'../models/{outname}'
if not os.path.isfile(f"{model}/config.json"):
raise Exception(f"Could not find config.json in {model}")
os.makedirs(outdir, exist_ok=True)
print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
print(f"Making unquantised GGUF at {fp16}")
if not os.path.isfile(fp16):
if model_type != "llama":
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
else:
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else:
print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants")
for type in quants:
outfile = f"{outdir}/{outname}.gguf.{type}.bin"
print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
if not keep_fp16:
os.remove(fp16)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
args = parser.parse_args()
main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)

View file

@ -210,7 +210,7 @@ int main(int argc, char ** argv) {
while (true) { while (true) {
if (dump_kv_cache) { if (dump_kv_cache) {
llama_kv_cache_view_update(ctx, &kvc_view); llama_kv_cache_view_update(ctx, &kvc_view);
dump_kv_cache_view_seqs(kvc_view, 40); llama_kv_cache_dump_view_seqs(kvc_view, 40);
} }
llama_batch_clear(batch); llama_batch_clear(batch);

View file

@ -42,10 +42,13 @@ In addition to the KL divergence the following statistics are calculated with `-
Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16. Results were generated using the CUDA backend and are sorted by Kullback-Leibler divergence relative to FP16.
The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat). The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat).
Note: the FP16 logits used for the calculation of all metrics other than perplexity are stored in a binary file between runs.
In order to save space this file does **not** contain the exact same FP32 logits but instead casts them to 16 bit unsigned integers (with some scaling).
So the "f16" results are to be understood as the difference resulting only from this downcast.
| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp | | Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp |
|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------| |--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------|
| f16 | None | 14.97 | 6.233160 ± 0.037828 | - | - | - | - | | f16 | None | 14.97 | 6.233160 ± 0.037828 | 0.001524 ± 0.000755 | 0.000551 ± 0.000002 | 0.001 ± 0.002 % | 0.787 ± 0.004 % |
| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % | | q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % |
| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % | | q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % |
| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % | | q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % |

View file

@ -44,9 +44,9 @@ static void write_logfile(
return; return;
} }
const std::string timestamp = get_sortable_timestamp(); const std::string timestamp = string_get_sortable_timestamp();
const bool success = create_directory_with_parents(params.logdir); const bool success = fs_create_directory_with_parents(params.logdir);
if (!success) { if (!success) {
fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n",
__func__, params.logdir.c_str()); __func__, params.logdir.c_str());
@ -64,7 +64,7 @@ static void write_logfile(
fprintf(logfile, "binary: main\n"); fprintf(logfile, "binary: main\n");
char model_desc[128]; char model_desc[128];
llama_model_desc(model, model_desc, sizeof(model_desc)); llama_model_desc(model, model_desc, sizeof(model_desc));
dump_non_result_info_yaml(logfile, params, ctx, timestamp, results.tokens, model_desc); yaml_dump_non_result_info(logfile, params, ctx, timestamp, results.tokens, model_desc);
fprintf(logfile, "\n"); fprintf(logfile, "\n");
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
@ -72,9 +72,9 @@ static void write_logfile(
fprintf(logfile, "######################\n"); fprintf(logfile, "######################\n");
fprintf(logfile, "\n"); fprintf(logfile, "\n");
dump_vector_float_yaml(logfile, "logits", results.logits); yaml_dump_vector_float(logfile, "logits", results.logits);
fprintf(logfile, "ppl_value: %f\n", results.ppl_value); fprintf(logfile, "ppl_value: %f\n", results.ppl_value);
dump_vector_float_yaml(logfile, "probs", results.probs); yaml_dump_vector_float(logfile, "probs", results.probs);
llama_dump_timing_info_yaml(logfile, ctx); llama_dump_timing_info_yaml(logfile, ctx);
fclose(logfile); fclose(logfile);
@ -1425,7 +1425,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
// Use all tasks // Use all tasks
tasks.resize(n_task); tasks.resize(n_task);
printf("%s: reading tasks", __func__); printf("%s: reading tasks", __func__);
int n_dot = n_task/100; int n_dot = std::max((int) n_task/100, 1);
int i = 0; int i = 0;
for (auto& task : tasks) { for (auto& task : tasks) {
++i; ++i;
@ -1675,7 +1675,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
llama_batch_free(batch); llama_batch_free(batch);
if (n_done < 100) return; if (n_done < 100 && (params.multiple_choice_tasks != 0 && params.multiple_choice_tasks < (size_t)n_task)) return;
float p = 1.f*n_correct/n_done; float p = 1.f*n_correct/n_done;
float sigma = sqrt(p*(1-p)/(n_done-1)); float sigma = sqrt(p*(1-p)/(n_done-1));
@ -2007,7 +2007,7 @@ int main(int argc, char ** argv) {
std::mt19937 rng(params.seed); std::mt19937 rng(params.seed);
if (params.random_prompt) { if (params.random_prompt) {
params.prompt = gpt_random_prompt(rng); params.prompt = string_random_prompt(rng);
} }
llama_backend_init(); llama_backend_init();
@ -2035,7 +2035,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
struct results_perplexity results; struct results_perplexity results;

View file

@ -1,6 +1,8 @@
# quantize # quantize
TODO You can also use the [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space on Hugging Face to build your own quants without any setup.
Note: It is synced from llama.cpp `main` every 6 hours.
## Llama 2 7B ## Llama 2 7B

View file

@ -259,7 +259,7 @@ int main(int argc, char ** argv) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--override-kv") == 0) { } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
@ -284,7 +284,7 @@ int main(int argc, char ** argv) {
} else { } else {
usage(argv[0]); usage(argv[0]);
} }
} else if (strcmp(argv[arg_idx], "--keep-split")) { } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
params.keep_split = true; params.keep_split = true;
} else { } else {
usage(argv[0]); usage(argv[0]);

View file

@ -41,8 +41,8 @@ $SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/g
echo PASS echo PASS
echo echo
# 3. Requant model with '--keep_split' # 3. Requant model with '--keep-split'
$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K $QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
echo PASS echo PASS
echo echo
@ -51,7 +51,7 @@ $MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt
echo PASS echo PASS
echo echo
# 4. Requant mode without '--keep_split' # 4. Requant mode without '--keep-split'
$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K $QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
echo PASS echo PASS
echo echo

View file

@ -11,7 +11,7 @@ struct retrieval_params {
}; };
static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) {
gpt_print_usage(argc, argv, gpt_params); gpt_params_print_usage(argc, argv, gpt_params);
printf("retrieval options:\n"); printf("retrieval options:\n");
printf(" --context-file FNAME file containing context to embed.\n"); printf(" --context-file FNAME file containing context to embed.\n");
printf(" specify multiple files by providing --context-file option multiple times.\n"); printf(" specify multiple files by providing --context-file option multiple times.\n");
@ -226,7 +226,7 @@ int main(int argc, char ** argv) {
// print system information // print system information
{ {
fprintf(stderr, "\n"); fprintf(stderr, "\n");
fprintf(stderr, "%s\n", get_system_info(params).c_str()); fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
} }
// max batch size // max batch size

View file

@ -42,7 +42,7 @@ cmake --build . --config Release
Then, start the `rpc-server` with the backend: Then, start the `rpc-server` with the backend:
```bash ```bash
$ bin/rpc-server 0.0.0.0 50052 $ bin/rpc-server -p 50052
create_backend: using CUDA backend create_backend: using CUDA backend
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes
@ -53,7 +53,7 @@ Starting RPC server on 0.0.0.0:50052
When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.: When using the CUDA backend, you can specify the device with the `CUDA_VISIBLE_DEVICES` environment variable, e.g.:
```bash ```bash
$ CUDA_VISIBLE_DEVICES=0 bin/rpc-server 0.0.0.0 50052 $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
``` ```
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device. This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
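A minimal sketch of that multi-instance setup (device indices and port numbers are arbitrary placeholders):
```bash
# one rpc-server per GPU, each bound to its own port
CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052 &
CUDA_VISIBLE_DEVICES=1 bin/rpc-server -p 50053 &
```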

View file

@ -6,10 +6,69 @@
#include "ggml-metal.h" #include "ggml-metal.h"
#endif #endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#include "ggml-rpc.h" #include "ggml-rpc.h"
#ifdef _WIN32
# include <windows.h>
#else
# include <unistd.h>
#endif
#include <string> #include <string>
#include <stdio.h> #include <stdio.h>
struct rpc_server_params {
std::string host = "0.0.0.0";
int port = 50052;
size_t backend_mem = 0;
};
static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) {
fprintf(stderr, "Usage: %s [options]\n\n", argv[0]);
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str());
fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port);
fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n");
fprintf(stderr, "\n");
}
static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & params) {
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-H" || arg == "--host") {
if (++i >= argc) {
return false;
}
params.host = argv[i];
} else if (arg == "-p" || arg == "--port") {
if (++i >= argc) {
return false;
}
params.port = std::stoi(argv[i]);
if (params.port <= 0 || params.port > 65535) {
return false;
}
} else if (arg == "-m" || arg == "--mem") {
if (++i >= argc) {
return false;
}
params.backend_mem = std::stoul(argv[i]) * 1024 * 1024;
} else if (arg == "-h" || arg == "--help") {
print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
static ggml_backend_t create_backend() { static ggml_backend_t create_backend() {
ggml_backend_t backend = NULL; ggml_backend_t backend = NULL;
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
@ -24,6 +83,12 @@ static ggml_backend_t create_backend() {
if (!backend) { if (!backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
} }
#elif GGML_USE_SYCL
fprintf(stderr, "%s: using SYCL backend\n", __func__);
backend = ggml_backend_sycl_init(0); // init device 0
if (!backend) {
fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__);
}
#endif #endif
// if there aren't GPU Backends fallback to CPU backend // if there aren't GPU Backends fallback to CPU backend
@ -38,21 +103,25 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
#else #else
// TODO: implement for other backends #ifdef _WIN32
*free_mem = 1; MEMORYSTATUSEX status;
*total_mem = 1; status.dwLength = sizeof(status);
GlobalMemoryStatusEx(&status);
*total_mem = status.ullTotalPhys;
*free_mem = status.ullAvailPhys;
#else
long pages = sysconf(_SC_PHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
*total_mem = pages * page_size;
*free_mem = *total_mem;
#endif
#endif #endif
} }
int main(int argc, char * argv[]) { int main(int argc, char * argv[]) {
if (argc < 3) { rpc_server_params params;
fprintf(stderr, "Usage: %s <host> <port>\n", argv[0]); if (!rpc_server_params_parse(argc, argv, params)) {
return 1; fprintf(stderr, "Invalid parameters\n");
}
const char * host = argv[1];
int port = std::stoi(argv[2]);
if (port <= 0 || port > 65535) {
fprintf(stderr, "Invalid port number: %d\n", port);
return 1; return 1;
} }
ggml_backend_t backend = create_backend(); ggml_backend_t backend = create_backend();
@ -60,10 +129,15 @@ int main(int argc, char * argv[]) {
fprintf(stderr, "Failed to create backend\n"); fprintf(stderr, "Failed to create backend\n");
return 1; return 1;
} }
printf("Starting RPC server on %s:%d\n", host, port); std::string endpoint = params.host + ":" + std::to_string(params.port);
size_t free_mem, total_mem; size_t free_mem, total_mem;
get_backend_memory(&free_mem, &total_mem); if (params.backend_mem > 0) {
std::string endpoint = std::string(host) + ":" + std::to_string(port); free_mem = params.backend_mem;
total_mem = params.backend_mem;
} else {
get_backend_memory(&free_mem, &total_mem);
}
printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem); start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
ggml_backend_free(backend); ggml_backend_free(backend);
return 0; return 0;

View file

@ -8,9 +8,20 @@ set(TARGET_SRCS
httplib.h httplib.h
) )
set(PUBLIC_ASSETS set(PUBLIC_ASSETS
colorthemes.css
style.css
theme-beeninorder.css
theme-ketivah.css
theme-mangotango.css
theme-playground.css
theme-polarnight.css
theme-snowstorm.css
index.html index.html
index-new.html
index.js index.js
completion.js completion.js
system-prompts.js
prompt-formats.js
json-schema-to-grammar.mjs json-schema-to-grammar.mjs
) )
foreach(asset ${PUBLIC_ASSETS}) foreach(asset ${PUBLIC_ASSETS})

View file

@ -17,8 +17,9 @@ The project is under active development, and we are [looking for feedback and co
**Command line options:** **Command line options:**
- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend. - `-v`, `--verbose`: Enable verbose server output. When using the `/completion` endpoint, this includes the tokenized prompt, the full request and the full response.
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. - `-t N`, `--threads N`: Set the number of threads to use by CPU layers during generation. Not used by model layers that are offloaded to GPU. This option has no effect when using the maximum number of GPU layers. Default: `std::thread::hardware_concurrency()` (number of CPU cores).
- `-tb N, --threads-batch N`: Set the number of threads to use by CPU layers during batch and prompt processing (>= 32 tokens). This option has no effect if a GPU is available. Default: `--threads`.
- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` - `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused
@ -36,9 +37,7 @@ The project is under active development, and we are [looking for feedback and co
- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems - `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems
- `--numa distribute`: Spread execution evenly over all nodes - `--numa distribute`: Spread execution evenly over all nodes
- `--numa isolate`: Only spawn threads on CPUs on the node that execution started on - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on
- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system - `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437
- `--numa`: Attempt optimizations that may help on some NUMA systems. - `--numa`: Attempt optimizations that may help on some NUMA systems.
- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
@ -49,7 +48,7 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. - `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys.
- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. - `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s.
- `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled - `--embeddings`: Enable embedding vector output and the OAI compatible endpoint /v1/embeddings. Physical batch size (`--ubatch-size`) must be carefully defined. Default: disabled
- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1` - `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1`. Values > 1 will allow for higher throughput with multiple parallel requests but the results will **not** be deterministic due to differences in rounding error.
- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled - `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime)
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
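For illustration only (not part of this diff): a minimal client sketch for a server started with `--embeddings --api-key mysecret`. The Bearer-token header follows the `--api-key` description above; the host, port, and the exact request and response shapes are assumptions based on the OpenAI-style `/v1/embeddings` endpoint.

```js
// Hypothetical sketch: query the OAI-compatible embeddings endpoint of a
// llama.cpp server assumed to listen on localhost:8080 and started with
// `--embeddings --api-key mysecret`.
async function getEmbedding(text) {
  const res = await fetch('http://localhost:8080/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // `--api-key` expects the key as a Bearer token in the Authorization header
      'Authorization': 'Bearer mysecret',
    },
    body: JSON.stringify({ input: text }),
  });
  if (!res.ok) {
    throw new Error(`embedding request failed: ${res.status}`);
  }
  const body = await res.json();
  // Assumed OpenAI-style response: { data: [ { embedding: [...] } ] }
  return body.data[0].embedding;
}

getEmbedding('Hello, world!').then(v => console.log('dimensions:', v.length));
```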

View file

@ -0,0 +1,402 @@
@import url("theme-snowstorm.css");
@import url("theme-polarnight.css");
@import url("theme-ketivah.css");
@import url("theme-mangotango.css");
@import url("theme-playground.css");
@import url("theme-beeninorder.css");
:root {
/* ---------- PRIMARY COLORS ----------------- */
--primary-color-1: hsl(217.5, 26.7%, 94.1%);
--primary-color-1-hue: 217.5;
--primary-color-1-saturation: 26.7%;
--primary-color-1-lightness: 94.1%;
--primary-color-2: hsl(218.2, 26.8%, 92.0%);
--primary-color-2-hue: 218.2;
--primary-color-2-saturation: 26.8%;
--primary-color-2-lightness: 92.0%;
--primary-color-3: hsl(218.8, 27.9%, 88.0%);
--primary-color-3-hue: 218.8;
--primary-color-3-saturation: 27.9%;
--primary-color-3-lightness: 88.0%;
--primary-color-4: hsl(218.8, 18.3%, 81.8%);
--primary-color-4-hue: 218.8;
--primary-color-4-saturation: 18.3%;
--primary-color-4-lightness: 81.8%;
/* ---------- SECONDARY COLORS --------------- */
--secondary-color-1: hsl(220.0, 16.4%, 21.6%);
--secondary-color-1-hue: 220.0;
--secondary-color-1-saturation: 16.4%;
--secondary-color-1-lightness: 21.6%;
--secondary-color-2: hsl(221.7, 16.3%, 27.6%);
--secondary-color-2-hue: 221.7;
--secondary-color-2-saturation: 16.3%;
--secondary-color-2-lightness: 27.6%;
--secondary-color-3: hsl(220.0, 16.8%, 31.6%);
--secondary-color-3-hue: 220.0;
--secondary-color-3-saturation: 16.8%;
--secondary-color-3-lightness: 31.6%;
--secondary-color-4: hsl(220.0, 16.5%, 35.7%);
--secondary-color-4-hue: 220.0;
--secondary-color-4-saturation: 16.5%;
--secondary-color-4-lightness: 35.7%;
/* ----------- NUANCES COLORS ---------------- */
--theme-nuance-color-1: hsl(178.7, 25.1%, 64.9%);
--theme-nuance-color-1-hue: 178.7;
--theme-nuance-color-1-saturation: 25.1%;
--theme-nuance-color-1-lightness: 64.9%;
--theme-nuance-color-2: hsl(193.3, 43.4%, 67.5%);
--theme-nuance-color-2-hue: 193.3;
--theme-nuance-color-2-saturation: 43.4%;
--theme-nuance-color-2-lightness: 67.5%;
--theme-nuance-color-3: hsl(210.0, 34.0%, 63.1%);
--theme-nuance-color-3-hue: 210.0;
--theme-nuance-color-3-saturation: 34.0%;
--theme-nuance-color-3-lightness: 63.1%;
--theme-nuance-color-4: hsl(213.1, 32.0%, 52.2%);
--theme-nuance-color-4-hue: 213.1;
--theme-nuance-color-4-saturation: 32.0%;
--theme-nuance-color-4-lightness: 52.2%;
/* ----------- ROYGP COLORS ------------------ */
--theme-red-color: hsl(32.5, 80%, 50%);
--theme-orange-color: hsl(32.5, 70%, 45%);
--theme-yellow-color: hsl(40.0, 0.6%, 73.3%);
--theme-green-color: hsl(92.4, 27.8%, 64.7%);
--theme-purple-color: hsl(311.1, 20.2%, 63.1%);
/* ------------------------------------------- */
--background-color-1: var(--primary-color-1);
--background-color-2: var(--primary-color-2);
--background-color-3: var(--primary-color-3);
--background-color-4: var(--primary-color-4);
--border-color-1: var(--primary-color-2);
--border-color-2: var(--primary-color-3);
--border-color-3: var(--primary-color-4);
--border-focus-color: var(--theme-nuance-color-2);
--border-focus-shadow: var(--theme-nuance-color-1);
--text-color-plain: var(--secondary-color-1);
--text-color-subtile-1: var(--secondary-color-2);
--text-color-subtile-2: var(--secondary-color-3);
--code-background-color: var(--secondary-color-2);
--code-text-color: var(--primary-color-2);
--ui-range-thumb-color: var(--theme-nuance-color-3);
--ui-range-thumb-border: var(--ui-range-thumb-color);
--textarea-border-color: var(--secondary-color-4);
--chat-id-color: var(--theme-nuance-color-4);
/* ------------------------------------------- */
--button-alert-text-hover: var(--primary-color-1);
--button-alert-color-hover: var(--theme-orange-color);
--button-alert-border-hover: var(--theme-orange-color);
--button-alert-text-active: var(--primary-color-1);
--button-alert-color-active: var(--theme-red-color);
--button-alert-border-active: var(--theme-red-color);
/* ----------- PRIMARY BUTTONS --------------- */
/* - button should immediately catch the eye - */
--button-primary-text: var(--secondary-color-1);
--button-primary-color: var(--theme-nuance-color-3);
--button-primary-border: var(--theme-nuance-color-3);
/* ---------hover---------- */
--button-primary-text-hover:
hsl(217.5,
calc(var(--secondary-color-1-saturation) + 35%),
calc(var(--secondary-color-1-lightness) - 30%));
--button-primary-color-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 2%),
calc(var(--theme-nuance-color-3-lightness) - 10%));
--button-primary-border-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 2%),
calc(var(--theme-nuance-color-3-lightness) - 10%));
/* ---------active--------- */
--button-primary-text-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 20%),
calc(var(--theme-nuance-color-3-lightness) + 35%));
--button-primary-color-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 10%),
calc(var(--theme-nuance-color-3-lightness) - 25%));
--button-primary-border-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 10%),
calc(var(--theme-nuance-color-3-lightness) - 25%));
/* ---------- SECONDARY BUTTONS -------------- */
/* these should NOT immediately catch the eye */
--button-secondary-text:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 20%),
calc(var(--theme-nuance-color-3-lightness) - 50%));
--button-secondary-color:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 20%),
calc(var(--theme-nuance-color-3-lightness) + 10%));
--button-secondary-border:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 20%),
calc(var(--theme-nuance-color-3-lightness) + 10%));
/* ---------hover---------- */
--button-secondary-text-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 20%),
calc(var(--theme-nuance-color-3-lightness) - 80%));
--button-secondary-color-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 22%),
calc(var(--theme-nuance-color-3-lightness) + 1%));
--button-secondary-border-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 22%),
calc(var(--theme-nuance-color-3-lightness) + 1%));
/* ---------active--------- */
--button-secondary-text-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) + 40%),
calc(var(--theme-nuance-color-3-lightness) - 55%));
--button-secondary-color-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 30%),
calc(var(--theme-nuance-color-3-lightness) - 5%));
--button-secondary-border-active:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 30%),
calc(var(--theme-nuance-color-3-lightness) - 5%));
/* ---------- TERTIARY BUTTONS --------------- */
/* ---------- disabled buttons --------------- */
--button-tertiary-text:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) - 5%));
--button-tertiary-color:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) + 20%));
--button-tertiary-border:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) + 20%));
/* ---------hover---------- */
--button-tertiary-text-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) - 5%));
--button-tertiary-color-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) + 20%));
--button-tertiary-border-hover:
hsl(210,
calc(var(--theme-nuance-color-3-saturation) - 40%),
calc(var(--theme-nuance-color-3-lightness) + 20%));
}
/*
.theme-template {
If light theme: should go from bright to darker
If dark theme: should go from dark to brighter
ideally these should be nothing but steps of
gray or slight variants of it
--primary-color-1: #2E3440;
--primary-color-2: #3B4252;
--primary-color-3: #434C5E;
--primary-color-4: #4C566A;
If light theme: should go from dark to brighter
If dark theme: should go from bright to darker
ideally these should be nothing but steps of
gray or slight variants of it
--secondary-color-1: #ECEFF4;
--secondary-color-2: #E5E9F0;
--secondary-color-3: #D8DEE9;
--secondary-color-4: #C8CED9;
Choose your nuance colors wisely. It is not easy to find
4 harmonizing nuance colors, but keep in mind that
a single accent color can work too.
--theme-nuance-color-1: #8FBCBB;
--theme-nuance-color-2: #88C0D0;
--theme-nuance-color-3: #81A1C1;
--theme-nuance-color-4: #5E81AC;
adapt the red, orange, yellow, green and
purple colors to the 'mood' of your overall design,
e.g. is it low-contrast? vibrant? dynamic? etc.
--theme-red-color: #BF616A;
--theme-orange-color: #D08770;
--theme-yellow-color: #EBCB8B;
--theme-green-color: #A3BE8C;
--theme-purple-color: #B48EAD;
NOTE: comment out all those `--- ...` lines
------------------------------------------------
--background-color-1:
--background-color-2:
--background-color-3:
--background-color-4:
--border-color-1:
--border-color-2:
--border-color-3:
--border-focus-color:
--border-focus-shadow:
--text-color-plain:
--text-color-subtile-1:
--text-color-subtile-2:
--code-background-color:
--code-text-color:
--ui-range-thumb-color:
--ui-range-thumb-border:
--textarea-border-color:
-------------------------------------------
--button-alert-text-hover:
--button-alert-color-hover:
--button-alert-border-hover:
--button-alert-text-active:
--button-alert-color-active:
--button-alert-border-active:
----------- PRIMARY -----------------------
--button should immediately catch the eye--
--button-primary-text:
--button-primary-color:
--button-primary-border:
---------hover----------
--button-primary-text-hover:
--button-primary-color-hover:
--button-primary-border-hover:
---------active---------
--button-primary-text-active:
--button-primary-color-active:
--button-primary-border-active:
------------ SECONDARY ------------------------
--button should NOT immediately catch the eye--
--button-secondary-text:
--button-secondary-color:
--button-secondary-border:
---------hover----------
--button-secondary-text-hover:
--button-secondary-color-hover:
--button-secondary-border-hover:
---------active---------
--button-secondary-text-active:
--button-secondary-color-active:
--button-secondary-border-active:
---------- TERTIARY -----------------------
---------- disabled buttons ---------------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
---------hover----------
--button-tertiary-text:
--button-tertiary-color:
--button-tertiary-border:
}
*/
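An illustrative aside (not part of this diff): because the themes above are ordinary CSS custom properties declared on `:root`, a page script could in principle override individual values at runtime with the standard `setProperty`/`getPropertyValue` DOM APIs. The property names below come from the block above; everything else is a hypothetical sketch.

```js
// Hypothetical sketch: tweak a couple of the custom properties defined above
// at runtime. Only standard DOM APIs are used here.
const rootStyle = document.documentElement.style;
rootStyle.setProperty('--theme-nuance-color-3', 'hsl(210, 34%, 63.1%)');
rootStyle.setProperty('--background-color-1', 'var(--primary-color-2)');

// Read back the value that is currently in effect (stylesheet or override).
const active = getComputedStyle(document.documentElement)
  .getPropertyValue('--theme-nuance-color-3')
  .trim();
console.log('active nuance color:', active);
```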

File diff suppressed because it is too large

View file

@ -12,6 +12,18 @@
font-size: 90%; font-size: 90%;
} }
.grid-container {
display: grid;
grid-template-columns: auto auto auto;
padding: 10px;
}
.grid-item {
padding: 5px;
/* font-size: 30px; */
text-align: center;
}
#container { #container {
margin: 0em auto; margin: 0em auto;
display: flex; display: flex;
@ -35,6 +47,67 @@
padding: 0.5em; padding: 0.5em;
} }
h1 {
text-align: center;
}
.customlink:link {
color: white;
background-color: #007aff;
font-weight: 600;
text-decoration: none;
float: right;
margin-top: 30px;
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
border-radius: 4px;
padding: 8px;
}
.customlink:visited {
color: white;
background-color: #007aff;
font-weight: 600;
text-decoration: none;
float: right;
margin-top: 30px;
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
padding: 8px;
}
.customlink:hover {
color: white;
background-color: #0070ee;
font-weight: 600;
text-decoration: none;
float: right;
margin-top: 30px;
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
padding: 8px;
}
.customlink:active {
color: #0070ee;
background-color: #80b3ef;
font-weight: 600;
text-decoration: none;
float: right;
margin-top: 30px;
display: flex;
flex-direction: row;
gap: 0.5em;
justify-content: flex-end;
padding: 8px;
}
body { body {
max-width: 600px; max-width: 600px;
min-width: 300px; min-width: 300px;
@ -594,7 +667,7 @@
message = html`<${Probabilities} data=${data} />` message = html`<${Probabilities} data=${data} />`
} else { } else {
const text = isArrayMessage ? const text = isArrayMessage ?
data.map(msg => msg.content).join('').replace(/^\s+/, '') : data.map(msg => msg.content).join('') :
data; data;
message = isCompletionMode ? message = isCompletionMode ?
text : text :
@ -877,19 +950,30 @@
// poor mans markdown replacement // poor mans markdown replacement
const Markdownish = (params) => { const Markdownish = (params) => {
const md = params.text const chunks = params.text.split('```');
.replace(/&/g, '&amp;')
.replace(/</g, '&lt;') for (let i = 0; i < chunks.length; i++) {
.replace(/>/g, '&gt;') if (i % 2 === 0) { // outside code block
.replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>') chunks[i] = chunks[i]
.replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>') .replace(/&/g, '&amp;')
.replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>') .replace(/</g, '&lt;')
.replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>') .replace(/>/g, '&gt;')
.replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>') .replace(/(^|\n)#{1,6} ([^\n]*)(?=([^`]*`[^`]*`)*[^`]*$)/g, '$1<h3>$2</h3>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>') .replace(/\*\*(.*?)\*\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
.replace(/`(.*?)`/g, '<code>$1</code>') .replace(/__(.*?)__(?=([^`]*`[^`]*`)*[^`]*$)/g, '<strong>$1</strong>')
.replace(/\n/gim, '<br />'); .replace(/\*(.*?)\*(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
return html`<span dangerouslySetInnerHTML=${{ __html: md }} />`; .replace(/_(.*?)_(?=([^`]*`[^`]*`)*[^`]*$)/g, '<em>$1</em>')
.replace(/```.*?\n([\s\S]*?)```/g, '<pre><code>$1</code></pre>')
.replace(/`(.*?)`/g, '<code>$1</code>')
.replace(/\n/gim, '<br />');
} else { // inside code block
chunks[i] = `<pre><code>${chunks[i]}</code></pre>`;
}
}
const restoredText = chunks.join('');
return html`<span dangerouslySetInnerHTML=${{ __html: restoredText }} />`;
}; };
const ModelGenerationInfo = (params) => { const ModelGenerationInfo = (params) => {
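A small worked example (not part of this diff) of the fence-splitting idea used in the new `Markdownish` above: splitting on the triple-backtick delimiter yields chunks in which even indices lie outside code fences and odd indices lie inside, which is why the loop only applies the markdown regexes when `i % 2 === 0`.

```js
// Sketch of the parity trick: even-indexed chunks are outside the ``` fences.
const sample = 'Inline *emphasis*\n```\nconst x = 1; // *not* emphasized\n```\nmore *text*';
const chunks = sample.split('```');
chunks.forEach((chunk, i) => {
  console.log(i % 2 === 0 ? 'outside:' : 'inside :', JSON.stringify(chunk));
});
// outside: "Inline *emphasis*\n"
// inside : "\nconst x = 1; // *not* emphasized\n"
// outside: "\nmore *text*"
```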
@ -903,6 +987,7 @@
` `
} }
// simple popover impl // simple popover impl
const Popover = (props) => { const Popover = (props) => {
const isOpen = useSignal(false); const isOpen = useSignal(false);
@ -1023,7 +1108,11 @@
return html` return html`
<div class="mode-${session.value.type}"> <div class="mode-${session.value.type}">
<header> <header>
<h1>llama.cpp</h1> <div class="grid-container">
<div class="grid-item"></div>
<div class="grid-item"><h1>llama.cpp</h1></div>
<div class="grid-item"><a class="customlink" href="index-new.html">New UI</a></div>
</div>
</header> </header>
<main id="content"> <main id="content">
@ -1054,4 +1143,3 @@
</body> </body>
</html> </html>

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff