Merge branch 'master' of https://github.com/ggerganov/llama.cpp into add-paligemma-support

This commit is contained in:
Andrei Betlen 2024-06-11 21:10:52 -04:00
commit 7612e4cdcc
324 changed files with 92943 additions and 25642 deletions

View file

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make
RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make
RUN make -j$(nproc)
ENTRYPOINT ["/app/.devops/tools.sh"]

View file

@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION as build
RUN apt-get update && \
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev
apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1
COPY requirements.txt requirements.txt
COPY requirements requirements
@ -18,7 +18,7 @@ COPY . .
ENV LLAMA_CURL=1
RUN make
RUN make -j$(nproc)
ENV LC_ALL=C.utf8

View file

@ -23,10 +23,13 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable CUDA
ENV LLAMA_CUDA=1
RUN make
RUN make -j$(nproc) main
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/main /main
ENTRYPOINT [ "/main" ]

View file

@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git

View file

@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++
RUN make
RUN make -j$(nproc) main
ENTRYPOINT [ "/app/main" ]

View file

@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=jammy
FROM ubuntu:$UBUNTU_VERSION as build
# Install build tools
RUN apt update && apt install -y git build-essential cmake wget
RUN apt update && apt install -y git build-essential cmake wget libgomp1
# Install Vulkan SDK
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \

View file

@ -9,10 +9,13 @@ WORKDIR /app
COPY . .
RUN make
RUN make -j$(nproc) main
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libgomp1
COPY --from=build /app/main /main
ENV LC_ALL=C.utf8

View file

@ -25,12 +25,12 @@ ENV LLAMA_CUDA=1
# Enable cURL
ENV LLAMA_CURL=1
RUN make
RUN make -j$(nproc) server
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server

View file

@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git libcurl4-openssl-dev
@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
rm /etc/apt/sources.list.d/intel-graphics.list && \
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev

View file

@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
RUN make
RUN make -j$(nproc)
ENTRYPOINT [ "/app/server" ]

View file

@ -11,12 +11,12 @@ COPY . .
ENV LLAMA_CURL=1
RUN make
RUN make -j$(nproc) server
FROM ubuntu:$UBUNTU_VERSION as runtime
RUN apt-get update && \
apt-get install -y libcurl4-openssl-dev
apt-get install -y libcurl4-openssl-dev libgomp1
COPY --from=build /app/server /server

View file

@ -8,7 +8,7 @@ arg1="$1"
shift
if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
python3 ./convert.py "$@"
python3 ./convert-hf-to-gguf.py "$@"
elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
./quantize "$@"
elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then

50
.github/ISSUE_TEMPLATE/01-bug-low.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: Low Severity Bugs
description: Used to report low severity bugs in llama.cpp (e.g. cosmetic issues, non critical UI glitches)
title: "Bug: "
labels: ["bug-unconfirmed", "low severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,50 @@
name: Medium Severity Bug
description: Used to report medium severity bugs in llama.cpp (e.g. Malfunctioning Features but generally still useable)
title: "Bug: "
labels: ["bug-unconfirmed", "medium severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

50
.github/ISSUE_TEMPLATE/03-bug-high.yml vendored Normal file
View file

@ -0,0 +1,50 @@
name: High Severity Bug
description: Used to report high severity bugs in llama.cpp (e.g. Malfunctioning features hindering important common workflow)
title: "Bug: "
labels: ["bug-unconfirmed", "high severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,50 @@
name: Critical Severity Bug
description: Used to report critical severity bugs in llama.cpp (e.g. Crashing, Corrupted, Dataloss)
title: "Bug: "
labels: ["bug-unconfirmed", "critical severity"]
body:
- type: markdown
attributes:
value: |
Thanks for taking the time to fill out this bug report!
Please include information about your system, the steps to reproduce the bug,
and the version of llama.cpp that you are using.
If possible, please provide a minimal code example that reproduces the bug.
- type: textarea
id: what-happened
attributes:
label: What happened?
description: Also tell us, what did you expect to happen?
placeholder: Tell us what you see!
validations:
required: true
- type: textarea
id: version
attributes:
label: Name and Version
description: Which executable and which version of our software are you running? (use `--version` to get a version string)
placeholder: |
$./main --version
version: 2999 (42b4109e)
built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
validations:
required: true
- type: dropdown
id: operating-system
attributes:
label: What operating system are you seeing the problem on?
multiple: true
options:
- Linux
- Mac
- Windows
- BSD
- Other? (Please let us know in description)
validations:
required: false
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

View file

@ -0,0 +1,51 @@
name: Enhancement
description: Used to request enhancements for llama.cpp
title: "Feature Request: "
labels: ["enhancement"]
body:
- type: markdown
attributes:
value: |
[Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
- type: checkboxes
id: prerequisites
attributes:
label: Prerequisites
description: Please confirm the following before submitting your enhancement request.
options:
- label: I am running the latest code. Mention the version if possible as well.
required: true
- label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
required: true
- label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
required: true
- label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
required: true
- type: textarea
id: feature-description
attributes:
label: Feature Description
description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
placeholder: Detailed description of the enhancement
validations:
required: true
- type: textarea
id: motivation
attributes:
label: Motivation
description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
placeholder: Explanation of why this feature is needed and its benefits
validations:
required: true
- type: textarea
id: possible-implementation
attributes:
label: Possible Implementation
description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
placeholder: Detailed description of potential implementation
validations:
required: false

52
.github/ISSUE_TEMPLATE/06-research.yml vendored Normal file
View file

@ -0,0 +1,52 @@
name: Research
description: Track new technical research area
title: "Research: "
labels: ["research 🔬"]
body:
- type: markdown
attributes:
value: |
Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
- type: checkboxes
id: research-stage
attributes:
label: Research Stage
description: Track general state of this research ticket
options:
- label: Background Research (Let's try to avoid reinventing the wheel)
- label: Hypothesis Formed (How do you think this will work and it's effect?)
- label: Strategy / Implementation Forming
- label: Analysis of results
- label: Debrief / Documentation (So people in the future can learn from us)
- type: textarea
id: background
attributes:
label: Previous existing literature and research
description: Whats the current state of the art and whats the motivation for this research?
- type: textarea
id: hypothesis
attributes:
label: Hypothesis
description: How do you think this will work and it's effect?
- type: textarea
id: implementation
attributes:
label: Implementation
description: Got an approach? e.g. a PR ready to go?
- type: textarea
id: analysis
attributes:
label: Analysis
description: How does the proposed implementation behave?
- type: textarea
id: logs
attributes:
label: Relevant log output
description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
render: shell

28
.github/ISSUE_TEMPLATE/07-refactor.yml vendored Normal file
View file

@ -0,0 +1,28 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities
title: "Refactor: "
labels: ["refactor"]
body:
- type: markdown
attributes:
value: |
Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
- type: textarea
id: background-description
attributes:
label: Background Description
description: Please provide a detailed written description of the pain points you are trying to solve.
placeholder: Detailed description behind your motivation to request refactor
validations:
required: true
- type: textarea
id: possible-approaches
attributes:
label: Possible Refactor Approaches
description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
placeholder: Your idea of possible refactoring opportunity/approaches
validations:
required: false

View file

@ -1,11 +0,0 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''
---
Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
If the bug concerns the server, please try to reproduce it first using the [server test scenario framework](https://github.com/ggerganov/llama.cpp/tree/master/examples/server/tests).

13
.github/ISSUE_TEMPLATE/config.yml vendored Normal file
View file

@ -0,0 +1,13 @@
blank_issues_enabled: true
contact_links:
- name: Got an idea?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
about: Pop it there. It may then become an enhancement ticket.
- name: Got a question?
url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
about: Ask a question there!
- name: Want to contribute?
url: https://github.com/ggerganov/llama.cpp/wiki/contribute
about: Head to the contribution guide page of the wiki for areas you can help with

View file

@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''
---
# Prerequisites
Please answer the following questions for yourself before submitting an issue.
- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.
# Feature Description
Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
# Motivation
Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
# Possible Implementation
If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.

14
.github/labeler.yml vendored
View file

@ -1,5 +1,16 @@
# https://github.com/actions/labeler
Kompute:
- changed-files:
- any-glob-to-any-file:
- ggml-kompute.h
- ggml-kompute.cpp
- README-kompute.md
Apple Metal:
- changed-files:
- any-glob-to-any-file:
- ggml-metal.h
- ggml-metal.cpp
- README-metal.md
SYCL:
- changed-files:
- any-glob-to-any-file:
@ -9,6 +20,7 @@ SYCL:
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
- ggml-cuda.h
- ggml-cuda/**
Vulkan:
- changed-files:

5
.github/pull_request_template.md vendored Normal file
View file

@ -0,0 +1,5 @@
- Self Reported Review Complexity:
- [ ] Review Complexity : Low
- [ ] Review Complexity : Medium
- [ ] Review Complexity : High
- [ ] I have read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md)

View file

@ -13,7 +13,7 @@ on:
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
@ -294,12 +294,22 @@ jobs:
- name: Build
id: cmake_build
if: ${{ matrix.sanitizer != 'THREAD' }}
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Build (no OpenMP)
id: cmake_build_no_openmp
if: ${{ matrix.sanitizer == 'THREAD' }}
run: |
mkdir build
cd build
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
- name: Test
id: cmake_test
run: |
@ -674,12 +684,10 @@ jobs:
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows-latest-cmake:
runs-on: windows-latest
runs-on: windows-2019
env:
OPENBLAS_VERSION: 0.3.23
OPENCL_VERSION: 2023.04.17
CLBLAST_VERSION: 1.6.0
SDE_VERSION: 9.33.0-2024-01-07
VULKAN_VERSION: 1.3.261.1
@ -696,8 +704,6 @@ jobs:
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
- build: 'avx512-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
- build: 'clblast-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/clblast"'
- build: 'openblas-x64'
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
- build: 'kompute-x64'
@ -722,27 +728,6 @@ jobs:
run: |
git submodule update --init kompute
- name: Download OpenCL SDK
id: get_opencl
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/opencl.zip -L "https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v${env:OPENCL_VERSION}/OpenCL-SDK-v${env:OPENCL_VERSION}-Win-x64.zip"
mkdir $env:RUNNER_TEMP/opencl
tar.exe -xvf $env:RUNNER_TEMP/opencl.zip --strip-components=1 -C $env:RUNNER_TEMP/opencl
- name: Download CLBlast
id: get_clblast
if: ${{ matrix.build == 'clblast-x64' }}
run: |
curl.exe -o $env:RUNNER_TEMP/clblast.7z -L "https://github.com/CNugteren/CLBlast/releases/download/${env:CLBLAST_VERSION}/CLBlast-${env:CLBLAST_VERSION}-windows-x64.7z"
curl.exe -o $env:RUNNER_TEMP/CLBlast.LICENSE.txt -L "https://github.com/CNugteren/CLBlast/raw/${env:CLBLAST_VERSION}/LICENSE"
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/clblast.7z
rename-item $env:RUNNER_TEMP/CLBlast-${env:CLBLAST_VERSION}-windows-x64 clblast
foreach ($f in (gci -Recurse -Path "$env:RUNNER_TEMP/clblast" -Filter '*.cmake')) {
$txt = Get-Content -Path $f -Raw
$txt.Replace('C:/vcpkg/packages/opencl_x64-windows/', "$($env:RUNNER_TEMP.Replace('\','/'))/opencl/") | Set-Content -Path $f -Encoding UTF8
}
- name: Download OpenBLAS
id: get_openblas
if: ${{ matrix.build == 'openblas-x64' }}
@ -776,13 +761,6 @@ jobs:
cmake -S . -B build ${{ matrix.defines }}
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS}
- name: Add clblast.dll
id: add_clblast_dll
if: ${{ matrix.build == 'clblast-x64' }}
run: |
cp $env:RUNNER_TEMP/clblast/lib/clblast.dll ./build/bin/Release
cp $env:RUNNER_TEMP/CLBlast.LICENSE.txt ./build/bin/Release/CLBlast-${env:CLBLAST_VERSION}.txt
- name: Add libopenblas.dll
id: add_libopenblas_dll
if: ${{ matrix.build == 'openblas-x64' }}
@ -806,7 +784,7 @@ jobs:
- name: Test
id: cmake_test
# not all machines have native AVX-512
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'clblast-x64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
run: |
cd build
ctest -L main -C Release --verbose --timeout 900
@ -851,7 +829,7 @@ jobs:
name: llama-bin-win-${{ matrix.build }}.zip
windows-latest-cmake-cuda:
runs-on: windows-latest
runs-on: windows-2019
strategy:
matrix:
@ -865,8 +843,9 @@ jobs:
with:
fetch-depth: 0
- uses: Jimver/cuda-toolkit@v0.2.11
- name: Install CUDA toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: ${{ matrix.cuda }}
method: 'network'
@ -1061,7 +1040,7 @@ jobs:
# hypervisor: 'qemu'
# run: |
# sudo pkg update
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 clinfo clover opencl clblast openblas
# sudo pkg install -y gmake automake autoconf pkgconf llvm15 openblas
# gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j `sysctl -n hw.ncpu`
release:

View file

@ -16,11 +16,9 @@ on:
branches:
- master
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
pull_request_target:
pull_request:
types: [opened, synchronize, reopened]
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
schedule:
- cron: '2 4 * * *'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
@ -115,7 +113,7 @@ jobs:
server-windows:
runs-on: windows-latest
runs-on: windows-2019
steps:
- name: Clone

3
.gitignore vendored
View file

@ -34,9 +34,11 @@ ggml-metal-embed.metal
lcov-report/
gcovr-report/
tags
build*
!build.zig
cmake-build-*
android-ndk-*
out/
tmp/
@ -105,6 +107,7 @@ examples/jeopardy/results.txt
examples/server/*.html.hpp
examples/server/*.js.hpp
examples/server/*.mjs.hpp
examples/server/*.css.hpp
poetry.lock
poetry.toml

View file

@ -72,6 +72,7 @@ else()
set(INS_ENB ON)
endif()
option(LLAMA_SVE "llama: enable SVE" OFF)
option(LLAMA_AVX "llama: enable AVX" ${INS_ENB})
option(LLAMA_AVX2 "llama: enable AVX2" ${INS_ENB})
option(LLAMA_AVX512 "llama: enable AVX512" OFF)
@ -105,11 +106,11 @@ set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
option(LLAMA_CUDA_FA_ALL_QUANTS "llama: compile all quants for FlashAttention" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
option(LLAMA_VULKAN "llama: use Vulkan" OFF)
option(LLAMA_VULKAN_CHECK_RESULTS "llama: run Vulkan op checks" OFF)
option(LLAMA_VULKAN_DEBUG "llama: enable Vulkan debug output" OFF)
@ -124,6 +125,7 @@ set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)")
option(LLAMA_KOMPUTE "llama: use Kompute" OFF)
option(LLAMA_RPC "llama: use RPC" OFF)
option(LLAMA_OPENMP "llama: use OpenMP" ON)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
@ -294,6 +296,17 @@ if (LLAMA_METAL)
)
endif()
if (LLAMA_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
message(STATUS "OpenMP found")
add_compile_definitions(GGML_USE_OPENMP)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else()
message(WARNING "OpenMP not found")
endif()
endif()
if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
@ -389,18 +402,36 @@ if (LLAMA_CUBLAS)
endif()
if (LLAMA_CUDA)
cmake_minimum_required(VERSION 3.17)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
set(GGML_HEADERS_CUDA ggml-cuda.h)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
@ -426,6 +457,18 @@ if (LLAMA_CUDA)
if (LLAMA_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
if (LLAMA_STATIC)
if (WIN32)
@ -443,21 +486,6 @@ if (LLAMA_CUDA)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
else()
message(WARNING "CUDA not found")
endif()
@ -474,22 +502,6 @@ if (LLAMA_RPC)
set(GGML_SOURCES_RPC ggml-rpc.cpp)
endif()
if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")
set(GGML_HEADERS_OPENCL ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp)
add_compile_definitions(GGML_USE_CLBLAST)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()
if (LLAMA_VULKAN)
find_package(Vulkan)
if (Vulkan_FOUND)
@ -529,12 +541,17 @@ if (LLAMA_VULKAN)
endif()
if (LLAMA_HIPBLAS)
if ($ENV{ROCM_PATH})
set(ROCM_PATH $ENV{ROCM_PATH})
if (NOT EXISTS $ENV{ROCM_PATH})
if (NOT EXISTS /opt/rocm)
set(ROCM_PATH /usr)
else()
set(ROCM_PATH /opt/rocm)
endif()
else()
set(ROCM_PATH $ENV{ROCM_PATH})
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
# CMake on Windows doesn't support the HIP language yet
if(WIN32)
@ -570,6 +587,10 @@ if (LLAMA_HIPBLAS)
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
@ -589,6 +610,19 @@ if (LLAMA_HIPBLAS)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (LLAMA_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_ROCM ${SRCS})
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
@ -627,6 +661,10 @@ if (LLAMA_SYCL)
add_compile_definitions(GGML_SYCL_F16)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
@ -742,6 +780,7 @@ if (LLAMA_KOMPUTE)
kompute-shaders/op_mul_mat_q4_0.comp
kompute-shaders/op_mul_mat_q4_1.comp
kompute-shaders/op_mul_mat_q6_k.comp
kompute-shaders/op_getrows_f32.comp
kompute-shaders/op_getrows_f16.comp
kompute-shaders/op_getrows_q4_0.comp
kompute-shaders/op_getrows_q4_1.comp
@ -774,6 +813,7 @@ if (LLAMA_KOMPUTE)
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_mul_mat_q6_k.h
shaderop_getrows_f32.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
@ -1040,6 +1080,9 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
# Raspberry Pi 3, 4, Zero 2 (32-bit)
list(APPEND ARCH_FLAGS -mno-unaligned-access)
endif()
if (LLAMA_SVE)
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
endif()
endif()
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
@ -1208,7 +1251,6 @@ add_library(ggml OBJECT
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL}
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
${GGML_SOURCES_RPC} ${GGML_HEADERS_RPC}
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
@ -1296,8 +1338,9 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_EXTRA}")
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
install(TARGETS ggml PUBLIC_HEADER)
@ -1306,7 +1349,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
install(TARGETS llama LIBRARY PUBLIC_HEADER)
install(
FILES convert.py
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
@ -1333,6 +1376,13 @@ if (LLAMA_METAL)
endif()
endif()
configure_file(cmake/llama.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
@ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
DESTINATION lib/pkgconfig)
#
# programs, examples and tests
#

View file

@ -1,4 +1,4 @@
{
{
"version": 4,
"configurePresets": [
{
@ -40,6 +40,10 @@
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] }
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
]
}

14
CONTRIBUTING.md Normal file
View file

@ -0,0 +1,14 @@
# Contributing Guidelines
## Checklist
* Make sure your PR follows the [coding guidelines](https://github.com/ggerganov/llama.cpp/blob/master/README.md#coding-guidelines)
* Test your changes using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
* Execute [the full CI locally on your machine](ci/README.md) before publishing
## PR formatting
* Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
- The PR template has a series of review complexity checkboxes `[ ]` that you can mark as `[X]` for your conveience. Refer to [About task lists](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) for more information.
* If the pull request only contains documentation changes (e.g., updating READMEs, adding new wiki pages), please add `[no ci]` to the commit title. This will skip unnecessary CI checks and help reduce build times.
* When squashing multiple commits on merge, use the following format for your commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : Fix typo in utils.py (#1234)`

View file

@ -1,7 +1,7 @@
# Define the default target now so that it is always the first target
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
# Binaries only useful for tests
@ -57,6 +57,8 @@ ifeq ($(UNAME_S),Darwin)
LLAMA_METAL := 1
endif
LLAMA_NO_OPENMP := 1
ifneq ($(UNAME_P),arm)
SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
ifeq ($(SYSCTL_M),1)
@ -67,6 +69,10 @@ ifeq ($(UNAME_S),Darwin)
endif
endif
ifdef LLAMA_RPC
BUILD_TARGETS += rpc-server
endif
default: $(BUILD_TARGETS)
test: $(TEST_TARGETS)
@ -135,12 +141,16 @@ MK_NVCCFLAGS = -std=c++11
ifdef LLAMA_FAST
MK_CFLAGS += -Ofast
HOST_CXXFLAGS += -Ofast
ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
endif # LLAMA_DEBUG
else
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
ifndef LLAMA_DEBUG
MK_NVCCFLAGS += -O3
endif
endif # LLAMA_DEBUG
endif # LLAMA_FAST
ifndef LLAMA_NO_CCACHE
CCACHE := $(shell which ccache)
@ -204,6 +214,7 @@ ifdef LLAMA_DEBUG
MK_CFLAGS += -O0 -g
MK_CXXFLAGS += -O0 -g
MK_LDFLAGS += -g
MK_NVCCFLAGS += -O0 -g
ifeq ($(UNAME_S),Linux)
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
@ -400,6 +411,12 @@ ifndef LLAMA_NO_ACCELERATE
endif
endif # LLAMA_NO_ACCELERATE
ifndef LLAMA_NO_OPENMP
MK_CPPFLAGS += -DGGML_USE_OPENMP
MK_CFLAGS += -fopenmp
MK_CXXFLAGS += -fopenmp
endif # LLAMA_NO_OPENMP
ifdef LLAMA_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_OPENBLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
@ -416,11 +433,26 @@ ifdef LLAMA_BLIS
MK_LDFLAGS += -lblis -L/usr/local/lib
endif # LLAMA_BLIS
ifdef LLAMA_RPC
MK_CPPFLAGS += -DGGML_USE_RPC
OBJS += ggml-rpc.o
endif # LLAMA_RPC
ifdef LLAMA_CUBLAS
# LLAMA_CUBLAS is deprecated and will be removed in the future
LLAMA_CUDA := 1
endif
OBJS_CUDA_TEMP_INST = $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/mmq*.cu))
ifdef LLAMA_CUDA_FA_ALL_QUANTS
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*.cu))
else
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu))
OBJS_CUDA_TEMP_INST += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu))
endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef LLAMA_CUDA
ifneq ('', '$(wildcard /opt/cuda)')
CUDA_PATH ?= /opt/cuda
@ -431,6 +463,7 @@ ifdef LLAMA_CUDA
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
MK_NVCCFLAGS += -use_fast_math
ifdef LLAMA_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
@ -441,6 +474,9 @@ endif # JETSON_EOL_MODULE_DETECT
ifdef LLAMA_DEBUG
MK_NVCCFLAGS += -lineinfo
endif # LLAMA_DEBUG
ifdef LLAMA_CUDA_DEBUG
MK_NVCCFLAGS += --device-debug
endif # LLAMA_CUDA_DEBUG
ifdef LLAMA_CUDA_NVCC
NVCC = $(CCACHE) $(LLAMA_CUDA_NVCC)
else
@ -490,7 +526,10 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
ifdef LLAMA_CUDA_CCBIN
MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
endif
endif # LLAMA_CUDA_CCBIN
ifdef LLAMA_CUDA_FA_ALL_QUANTS
MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
endif # LLAMA_CUDA_FA_ALL_QUANTS
ifdef JETSON_EOL_MODULE_DETECT
define NVCC_COMPILE
@ -502,30 +541,13 @@ define NVCC_COMPILE
endef # NVCC_COMPILE
endif # JETSON_EOL_MODULE_DETECT
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(NVCC_COMPILE)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(NVCC_COMPILE)
endif # LLAMA_CUDA
ifdef LLAMA_CLBLAST
MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL)
MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL)
# Mac provides OpenCL as a framework
ifeq ($(UNAME_S),Darwin)
MK_LDFLAGS += -lclblast -framework OpenCL
else
MK_LDFLAGS += $(shell pkg-config --libs clblast OpenCL)
endif
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # LLAMA_CLBLAST
ifdef LLAMA_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += -lvulkan
@ -568,6 +590,7 @@ ifdef LLAMA_HIP_UMA
MK_CPPFLAGS += -DGGML_HIP_UMA
endif # LLAMA_HIP_UMA
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
@ -581,11 +604,12 @@ ifdef LLAMA_CUDA_NO_PEER_COPY
endif # LLAMA_CUDA_NO_PEER_COPY
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
OBJS += $(OBJS_CUDA_TEMP_INST)
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh
ggml-cuda/%.o: ggml-cuda/%.cu ggml.h ggml-common.h ggml-cuda/common.cuh
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
endif # LLAMA_HIPBLAS
@ -623,11 +647,26 @@ ggml-metal-embed.o: ggml-metal.metal ggml-common.h
endif
endif # LLAMA_METAL
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
ifndef LLAMA_NO_LLAMAFILE
sgemm.o: sgemm.cpp sgemm.h ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif
ifdef LLAMA_RPC
ggml-rpc.o: ggml-rpc.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
rpc-server.o: examples/rpc/rpc-server.cpp ggml-rpc.h
$(CXX) $(CXXFLAGS) -c $< -o $@
rpc-server: rpc-server.o ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
endif # LLAMA_RPC
GF_CC := $(CC)
include scripts/get-flags.mk
@ -707,14 +746,9 @@ unicode.o: unicode.cpp unicode.h
unicode-data.o: unicode-data.cpp unicode-data.h
$(CXX) $(CXXFLAGS) -c $< -o $@
OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o
llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
$(CXX) $(CXXFLAGS) -c $< -o $@
COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h
COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o
common.o: common/common.cpp $(COMMON_H_DEPS)
$(CXX) $(CXXFLAGS) -c $< -o $@
@ -745,6 +779,7 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS)
clean:
rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS)
rm -vrf ggml-cuda/*.o
rm -vrf ggml-cuda/template-instances/*.o
find examples pocs -type f -name "*.o" -delete
#
@ -813,7 +848,7 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/colorthemes.css.hpp examples/server/style.css.hpp examples/server/theme-beeninorder.css.hpp examples/server/theme-ketivah.css.hpp examples/server/theme-mangotango.css.hpp examples/server/theme-playground.css.hpp examples/server/theme-polarnight.css.hpp examples/server/theme-snowstorm.css.hpp examples/server/index.html.hpp examples/server/index-new.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/system-prompts.js.hpp examples/server/prompt-formats.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)
@ -863,10 +898,6 @@ baby-llama: examples/baby-llama/baby-llama.cpp ggml.o llama.o $(COMMON_DEPS) tra
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
beam-search: examples/beam-search/beam-search.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
finetune: examples/finetune/finetune.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

View file

@ -29,7 +29,7 @@ The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based o
When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose.
## News
@ -55,8 +55,8 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
## OS
| OS | Status | Verified |
|---------|---------|------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 |
|---------|---------|------------------------------------------------|
| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39, Arch Linux |
| Windows | Support | Windows 11 |
@ -70,7 +70,7 @@ It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS,
|-------------------------------|---------|---------------------------------------|
| Intel Data Center Max Series | Support | Max 1550, 1100 |
| Intel Data Center Flex Series | Support | Flex 170 |
| Intel Arc Series | Support | Arc 770, 730M |
| Intel Arc Series | Support | Arc 770, 730M, Arc A750 |
| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 |

202
README.md
View file

@ -2,7 +2,9 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[![Conan Center](https://shields.io/conan/v/llama-cpp)](https://conan.io/center/llama-cpp)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@ -50,7 +53,6 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#constrained-output-with-grammars">Constrained output with grammars</a></li>
<li><a href="#instruct-mode">Instruct mode</a></li>
<li><a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a></li>
<li><a href="#seminal-papers-and-background-on-the-models">Seminal papers and background on the models</a></li>
<li><a href="#perplexity-measuring-model-quality">Perplexity (measuring model quality)</a></li>
@ -74,7 +76,7 @@ variety of hardware - locally and in the cloud.
- AVX, AVX2 and AVX512 support for x86 architectures
- 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
- Vulkan, SYCL, and (partial) OpenCL backend support
- Vulkan and SYCL backend support
- CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
Since its [inception](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022), the project has
@ -147,6 +149,8 @@ Typically finetunes of the base models below are supported as well.
[llama.cpp web server](./examples/server) is a lightweight [OpenAI API](https://github.com/openai/openai-openapi) compatible HTTP server that can be used to serve local models and easily connect them to existing clients.
[simplechat](./examples/server/public_simplechat) is a simple chat client, which can be used to chat with the model exposed using above web server (use --path to point to simplechat), from a local web browser.
**Bindings:**
- Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
@ -200,9 +204,14 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
- [AIKit](https://github.com/sozercan/aikit) (MIT)
*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
**Tools:**
- [akx/ggify](https://github.com/akx/ggify) download PyTorch models from HuggingFace Hub and convert them to GGML
---
Here is a typical run using LLaMA v2 13B on M2 Ultra:
@ -311,8 +320,6 @@ In order to build llama.cpp you have four different options.
make
```
**Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
- On Windows:
1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@ -324,6 +331,11 @@ In order to build llama.cpp you have four different options.
make
```
- Notes:
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, run `make LLAMA_DEBUG=1`
- Using `CMake`:
```bash
@ -331,33 +343,26 @@ In order to build llama.cpp you have four different options.
cmake --build build --config Release
```
**Note**: for `Debug` builds, there are two cases:
**Notes**:
- Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
- For faster repeated compilation, install [ccache](https://ccache.dev/).
- For debug builds, there are two cases:
1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
```bash
cmake -B build -DCMAKE_BUILD_TYPE=Debug
cmake --build build
```
- Multi-config generators (`-G` param set to Visual Studio, XCode...):
2. Multi-config generators (`-G` param set to Visual Studio, XCode...):
```bash
cmake -B build -G "Xcode"
cmake --build build --config Debug
```
- Using `Zig` (version 0.11 or later):
Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C,
it's also possible to cross compile for other operating systems and architectures:
```bash
zig build -Doptimize=ReleaseFast -Dtarget=x86_64-windows-gnu -Dcpu=x86_64+avx2+fma+f16c
```
The `zig targets` command will give you valid options to use.
- Using `gmake` (FreeBSD):
1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics)
@ -365,15 +370,18 @@ In order to build llama.cpp you have four different options.
3. Install compilation dependencies.
```bash
sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \
opencl clblast openblas
sudo pkg install gmake automake autoconf pkgconf llvm15 openblas
gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4
```
**Notes:** With this packages you can build llama.cpp with OPENBLAS and
CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
the instructions for use and activate this options in this document below.
### Homebrew
On Mac and Linux, the homebrew package manager can be used via
```
brew install llama.cpp
```
The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
### Metal Build
@ -385,7 +393,7 @@ argument.
### BLAS Build
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS and CLBlast. There are currently several different BLAS implementations available for build and use:
Building the program with BLAS support may lead to some performance improvements in prompt processing using batch sizes higher than 32 (the default is 512). Support with CPU-only BLAS implementations doesn't affect the normal generation performance. We may see generation performance improvements with GPU-involved BLAS implementations, e.g. cuBLAS, hipBLAS. There are currently several different BLAS implementations available for build and use:
- #### Accelerate Framework:
@ -474,9 +482,11 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
- #### hipBLAS
@ -537,111 +547,6 @@ Building the program with BLAS support may lead to some performance improvements
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
- #### CLBlast
OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK).
- For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed.
- For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page.
- <details>
<summary>Installing the OpenCL SDK from source</summary>
```sh
git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git
cd OpenCL-SDK
cmake -B build -DBUILD_DOCS=OFF \
-DBUILD_EXAMPLES=OFF \
-DBUILD_TESTING=OFF \
-DOPENCL_SDK_BUILD_SAMPLES=OFF \
-DOPENCL_SDK_TEST_SAMPLES=OFF
cmake --build build
cmake --install build --prefix /some/path
```
</details>
##### Installing CLBlast
Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages.
Linux packaging:
Fedora Linux:
```bash
sudo dnf install clblast
```
Alternatively, they may be built from source.
- <details>
<summary>Windows:</summary>
```cmd
set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64"
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/CLBlast
```
(note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds)
- <details>
<summary>Unix:</summary>
```sh
git clone https://github.com/CNugteren/CLBlast.git
cd CLBlast
cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF
cmake --build build --config Release
cmake --install build --prefix /some/path
```
Where `/some/path` is where the built library will be installed (default is `/usr/local`).
</details>
##### Building Llama with CLBlast
- Build with make:
```sh
make LLAMA_CLBLAST=1
```
- CMake (Unix):
```sh
cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
cmake --build build --config Release
```
- CMake (Windows):
```cmd
set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast"
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
##### Running Llama with CLBlast
The CLBlast build supports `--gpu-layers|-ngl` like the CUDA version does.
To select the correct platform (driver) and device (GPU), you can use the environment variables `GGML_OPENCL_PLATFORM` and `GGML_OPENCL_DEVICE`.
The selection can be a number (starting from 0) or a text string to search:
```sh
GGML_OPENCL_PLATFORM=1 ./main ...
GGML_OPENCL_DEVICE=2 ./main ...
GGML_OPENCL_PLATFORM=Intel ./main ...
GGML_OPENCL_PLATFORM=AMD GGML_OPENCL_DEVICE=1 ./main ...
```
The default behavior is to find the first GPU device, but when it is an integrated GPU on a laptop, for instance, the selectors are useful.
Using the variables it is possible to select a CPU-based driver as well, if so desired.
You can get a list of platforms and devices from the `clinfo -l` command, etc.
- #### Vulkan
**With docker**:
@ -692,7 +597,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derivatives.
It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
# obtain the official LLaMA model weights and place them in ./models
@ -709,10 +615,10 @@ ls ./models
python3 -m pip install -r requirements.txt
# convert the model to ggml FP16 format
python3 convert.py models/mymodel/
python3 convert-hf-to-gguf.py models/mymodel/
# [Optional] for models using BPE tokenizers
python convert.py models/mymodel/ --vocab-type bpe
python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe
# quantize the model to 4-bits (using Q4_K_M method)
./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
@ -862,34 +768,6 @@ The `grammars/` folder contains a handful of sample grammars. To write your own,
For authoring more complex JSON grammars, you can also check out https://grammar.intrinsiclabs.ai/, a browser app that lets you write TypeScript interfaces which it compiles to GBNF grammars that you can save for local use. Note that the app is built and maintained by members of the community, please file any issues or FRs on [its repo](http://github.com/intrinsiclabsai/gbnfgen) and not this one.
### Instruct mode
1. First, download and place the `ggml` model into the `./models` folder
2. Run the `main` tool like this:
```
./examples/alpaca.sh
```
Sample run:
```
== Running in interactive mode. ==
- Press Ctrl+C to interject at any time.
- Press Return to return control to LLaMA.
- If you want to submit another line, end your input in '\'.
Below is an instruction that describes a task. Write a response that appropriately completes the request.
> How many letters are there in the English alphabet?
There 26 letters in the English Alphabet
> What is the most common way of transportation in Amsterdam?
The majority (54%) are using public transit. This includes buses, trams and metros with over 100 lines throughout the city which make it very accessible for tourists to navigate around town as well as locals who commute by tram or metro on a daily basis
> List 5 words that start with "ca".
cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```
### Obtaining and using the Facebook LLaMA 2 model
- Refer to [Facebook's LLaMA download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) if you want to access the model data.

View file

@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
model_f16="${path_models}/ggml-model-f16.gguf"
model_q8_0="${path_models}/ggml-model-q8_0.gguf"

View file

@ -9,7 +9,7 @@ set( CMAKE_CXX_COMPILER clang++ )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast" )
set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )
set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

10
cmake/llama.pc.in Normal file
View file

@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lllama
Cflags: -I${includedir}

View file

@ -84,4 +84,4 @@ endif ()
target_include_directories(${TARGET} PUBLIC .)
target_compile_features(${TARGET} PUBLIC cxx_std_11)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama)
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)

File diff suppressed because it is too large Load diff

View file

@ -60,7 +60,7 @@ struct gpt_params {
int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
int32_t n_threads_batch_draft = -1;
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 512; // context size
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
int32_t n_keep = 0; // number of tokens to keep from initial prompt
@ -71,7 +71,6 @@ struct gpt_params {
float p_split = 0.1f; // speculative decoding split probability
int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
int32_t n_beams = 0; // if non-zero then use beam search of given width.
@ -86,13 +85,13 @@ struct gpt_params {
float yarn_beta_slow = 1.0f; // YaRN high correction dim
int32_t yarn_orig_ctx = 0; // YaRN original context length
float defrag_thold = -1.0f; // KV cache defragmentation threshold
std::string rpc_servers = ""; // comma separated list of RPC servers
ggml_backend_sched_eval_callback cb_eval = nullptr;
void * cb_eval_user_data = nullptr;
ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
@ -110,12 +109,14 @@ struct gpt_params {
std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
std::string input_prefix = ""; // string to prefix user inputs with
std::string input_suffix = ""; // string to suffix user inputs with
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
std::string logdir = ""; // directory in which to save YAML log files
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding
std::string logits_file = ""; // file for saving *all* logits
std::string rpc_servers = ""; // comma separated list of RPC servers
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
std::vector<llama_model_kv_override> kv_overrides;
// TODO: avoid tuple, use struct
@ -124,11 +125,12 @@ struct gpt_params {
std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector
int32_t control_vector_layer_end = -1; // layer range for control vector
int ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
// (which is more convenient to use for plotting)
//
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
@ -142,18 +144,17 @@ struct gpt_params {
bool kl_divergence = false; // compute KL divergence
bool random_prompt = false; // do not randomize prompt if none provided
bool usage = false; // print usage
bool use_color = false; // use color to distinguish generations and inputs
bool special = false; // enable special token output
bool interactive = false; // interactive mode
bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
bool interactive_first = false; // wait for user input immediately
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
bool embedding = false; // get only sentence embedding
bool escape = false; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool interactive_first = false; // wait for user input immediately
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
bool multiline_input = false; // reverse the usage of `\`
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
bool cont_batching = true; // insert new sequences for decoding on-the-fly
@ -161,7 +162,6 @@ struct gpt_params {
bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
bool ignore_eos = false; // ignore generated EOS tokens
bool instruct = false; // instruction mode (used for Alpaca models)
bool logits_all = false; // return logits for all tokens in the batch
bool use_mmap = true; // use mmap for faster loads
bool use_mlock = false; // use mlock to keep model in memory
@ -179,6 +179,59 @@ struct gpt_params {
// multimodal models (see examples/llava)
std::string mmproj = ""; // path to multimodal projector
std::vector<std::string> image; // path to image file(s)
// server params
int32_t port = 8080; // server listens on this network port
int32_t timeout_read = 600; // http read timeout in seconds
int32_t timeout_write = timeout_read; // http write timeout in seconds
int32_t n_threads_http = -1; // number of threads to process HTTP requests
std::string hostname = "127.0.0.1";
std::string public_path = "";
std::string chat_template = "";
std::string system_prompt = "";
std::vector<std::string> api_keys;
std::string ssl_file_key = "";
std::string ssl_file_cert = "";
bool endpoint_slots = true;
bool endpoint_metrics = false;
bool log_json = false;
std::string slot_save_path;
float slot_prompt_similarity = 0.5f;
// batched-bench params
bool is_pp_shared = false;
std::vector<int32_t> n_pp;
std::vector<int32_t> n_tg;
std::vector<int32_t> n_pl;
// retrieval params
std::vector<std::string> context_files; // context files to embed
int32_t chunk_size = 64; // chunk size for context embedding
std::string chunk_separator = "\n"; // chunk separator for context embedding
// passkey params
int32_t n_junk = 250; // number of times to repeat the junk text
int32_t i_pos = -1; // position of the passkey in the junk text
// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
int32_t i_chunk = 0; // start processing from this chunk
bool process_output = false; // collect data for the output tensor
bool compute_ppl = true; // whether to compute perplexity
};
void gpt_params_handle_model_default(gpt_params & params);
@ -198,7 +251,20 @@ std::vector<std::string> string_split(std::string input, char separator);
std::string string_strip(const std::string & str);
std::string string_get_sortable_timestamp();
std::string string_random_prompt(std::mt19937 & rng);
template<class T>
static std::vector<T> string_split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
bool string_parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
void string_process_escapes(std::string & input);
@ -211,6 +277,7 @@ bool fs_validate_filename(const std::string & filename);
bool fs_create_directory_with_parents(const std::string & path);
std::string fs_get_cache_directory();
std::string fs_get_cache_file(const std::string & filename);
//
// Model utils
@ -281,6 +348,13 @@ std::string llama_detokenize_bpe(
// defaults to true when model type is SPM, otherwise false.
bool llama_should_add_bos_token(const llama_model * model);
//
// Chat template utils
//
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
bool llama_chat_verify_template(const std::string & tmpl);
//
// KV cache utils
//

View file

@ -46,8 +46,12 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
static bool is_digit_char(char c) {
return '0' <= c && c <= '9';
}
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c);
}
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
@ -99,6 +103,17 @@ namespace grammar_parser {
return pos;
}
static const char * parse_int(const char * src) {
const char * pos = src;
while (is_digit_char(*pos)) {
pos++;
}
if (pos == src) {
throw std::runtime_error(std::string("expecting integer at ") + src);
}
return pos;
}
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
@ -137,6 +152,60 @@ namespace grammar_parser {
bool is_nested) {
size_t last_sym_start = out_elements.size();
const char * pos = src;
auto handle_repetitions = [&](int min_times, int max_times) {
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// the following rewrite rules:
// S{m,n} --> S S S (m times) S'(n-m)
// S'(x) ::= S S'(x-1) |
// (... n-m definitions of these S' rules ...)
// S'(1) ::= S |
// S{m,} --> S S S (m times) S'
// S' ::= S S' |
// S* --> S{0,}
// --> S' ::= S S' |
// S+ --> S{1,}
// --> S S'
// S' ::= S S' |
// S? --> S{0,1}
// --> S'
// S' ::= S |
std::vector<llama_grammar_element> previous_elements(out_elements.begin() + last_sym_start, out_elements.end());
if (min_times == 0) {
out_elements.resize(last_sym_start);
} else {
// Repeat the previous elements (min_times - 1) times
for (int i = 1; i < min_times; i++) {
out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end());
}
}
uint32_t last_rec_rule_id = 0;
auto n_opt = max_times < 0 ? 1 : max_times - min_times;
std::vector<llama_grammar_element> rec_rule(previous_elements);
for (int i = 0; i < n_opt; i++) {
rec_rule.resize(previous_elements.size());
uint32_t rec_rule_id = generate_symbol_id(state, rule_name);
if (i > 0 || max_times < 0) {
rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id});
}
rec_rule.push_back({LLAMA_GRETYPE_ALT, 0});
rec_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, rec_rule_id, rec_rule);
last_rec_rule_id = rec_rule_id;
}
if (n_opt > 0) {
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id});
}
};
while (*pos) {
if (*pos == '"') { // literal string
pos++;
@ -197,40 +266,51 @@ namespace grammar_parser {
throw std::runtime_error(std::string("expecting ')' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*' || *pos == '+' || *pos == '?') { // repetition operator
if (last_sym_start == out_elements.size()) {
throw std::runtime_error(std::string("expecting preceding item to */+/? at ") + pos);
}
// apply transformation to previous symbol (last_sym_start to end) according to
// rewrite rules:
// S* --> S' ::= S S' |
// S+ --> S' ::= S S' | S
// S? --> S' ::= S |
uint32_t sub_rule_id = generate_symbol_id(state, rule_name);
std::vector<llama_grammar_element> sub_rule;
// add preceding symbol to generated rule
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
if (*pos == '*' || *pos == '+') {
// cause generated rule to recurse
sub_rule.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
}
// mark start of alternate def
sub_rule.push_back({LLAMA_GRETYPE_ALT, 0});
if (*pos == '+') {
// add preceding symbol as alternate only for '+' (otherwise empty)
sub_rule.insert(
sub_rule.end(), out_elements.begin() + last_sym_start, out_elements.end());
}
sub_rule.push_back({LLAMA_GRETYPE_END, 0});
add_rule(state, sub_rule_id, sub_rule);
// in original rule, replace previous symbol with reference to generated rule
out_elements.resize(last_sym_start);
out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id});
} else if (*pos == '.') { // any char
last_sym_start = out_elements.size();
out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0});
pos = parse_space(pos + 1, is_nested);
} else if (*pos == '*') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, -1);
} else if (*pos == '+') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(1, -1);
} else if (*pos == '?') {
pos = parse_space(pos + 1, is_nested);
handle_repetitions(0, 1);
} else if (*pos == '{') {
pos = parse_space(pos + 1, is_nested);
if (!is_digit_char(*pos)) {
throw std::runtime_error(std::string("expecting an int at ") + pos);
}
const char * int_end = parse_int(pos);
int min_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
int max_times = -1;
if (*pos == '}') {
max_times = min_times;
pos = parse_space(pos + 1, is_nested);
} else if (*pos == ',') {
pos = parse_space(pos + 1, is_nested);
if (is_digit_char(*pos)) {
const char * int_end = parse_int(pos);
max_times = std::stoul(std::string(pos, int_end - pos));
pos = parse_space(int_end, is_nested);
}
if (*pos != '}') {
throw std::runtime_error(std::string("expecting '}' at ") + pos);
}
pos = parse_space(pos + 1, is_nested);
} else {
throw std::runtime_error(std::string("expecting ',' at ") + pos);
}
handle_repetitions(min_times, max_times);
} else {
break;
}
@ -325,6 +405,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: return true;
case LLAMA_GRETYPE_CHAR_ALT: return true;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true;
case LLAMA_GRETYPE_CHAR_ANY: return true;
default: return false;
}
}
@ -339,6 +420,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT: fprintf(file, "CHAR_NOT"); break;
case LLAMA_GRETYPE_CHAR_RNG_UPPER: fprintf(file, "CHAR_RNG_UPPER"); break;
case LLAMA_GRETYPE_CHAR_ALT: fprintf(file, "CHAR_ALT"); break;
case LLAMA_GRETYPE_CHAR_ANY: fprintf(file, "CHAR_ANY"); break;
}
switch (elem.type) {
case LLAMA_GRETYPE_END:
@ -350,6 +432,7 @@ namespace grammar_parser {
case LLAMA_GRETYPE_CHAR_NOT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, "(\"");
print_grammar_char(file, elem.value);
fprintf(file, "\") ");
@ -407,11 +490,15 @@ namespace grammar_parser {
}
print_grammar_char(file, elem.value);
break;
case LLAMA_GRETYPE_CHAR_ANY:
fprintf(file, ".");
break;
}
if (is_char_element(elem)) {
switch (rule[i + 1].type) {
case LLAMA_GRETYPE_CHAR_ALT:
case LLAMA_GRETYPE_CHAR_RNG_UPPER:
case LLAMA_GRETYPE_CHAR_ANY:
break;
default:
fprintf(file, "] ");

View file

@ -16,92 +16,55 @@ static std::string join(Iterator begin, Iterator end, const std::string & separa
static std::string repeat(const std::string & str, size_t n);
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) {
if (separator_rule.empty()) {
static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") {
auto has_max = max_items != std::numeric_limits<int>::max();
if (min_items == 0 && max_items == 1) {
return item_rule + "?";
} else if (min_items == 1 && max_items == std::numeric_limits<int>::max()) {
}
if (separator_rule.empty()) {
if (min_items == 1 && !has_max) {
return item_rule + "+";
}
}
std::string result;
if (min_items > 0) {
if (item_rule_is_literal && separator_rule.empty()) {
result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\"";
} else if (min_items == 0 && !has_max) {
return item_rule + "*";
} else {
std::vector<std::string> items(min_items, item_rule);
result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " ");
return item_rule + "{" + std::to_string(min_items) + "," + (has_max ? std::to_string(max_items) : "") + "}";
}
}
std::function<std::string(int, bool)> opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string {
auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule;
if (up_to_n == 0) {
return "";
} else if (up_to_n == 1) {
return "(" + content + ")?";
} else if (!separator_rule.empty() && !prefix_with_sep) {
return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?";
} else {
std::string res = repeat("(" + content + " ", up_to_n);
// strip trailing space
res = res.substr(0, res.length() - 1);
res += repeat(")?", up_to_n);
return res;
auto result = item_rule + " " + build_repetition("(" + separator_rule + " " + item_rule + ")", min_items == 0 ? 0 : min_items - 1, has_max ? max_items - 1 : max_items);
if (min_items == 0) {
result = "(" + result + ")?";
}
};
if (min_items > 0 && max_items != min_items) {
result += " ";
}
if (max_items != std::numeric_limits<int>::max()) {
result += opt_repetitions(max_items - min_items, min_items > 0);
} else {
std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")";
if (min_items == 0 && !separator_rule.empty()) {
result = "(" + item_rule + " " + item_operator + "*)?";
} else {
result += item_operator + "*";
}
}
return result;
}
const std::string SPACE_RULE = "\" \"?";
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
struct BuiltinRule {
std::string content;
std::vector<std::string> deps;
};
const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15);
std::unordered_map<std::string, BuiltinRule> PRIMITIVE_RULES = {
{"boolean", {"(\"true\" | \"false\") space", {}}},
{"decimal-part", {"[0-9] " + _up_to_15_digits, {}}},
{"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}},
{"decimal-part", {"[0-9]{1,16}", {}}},
{"integral-part", {"[0] | [1-9] [0-9]{0,15}", {}}},
{"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}},
{"integer", {"(\"-\"? integral-part) space", {"integral-part"}}},
{"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}},
{"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}},
{"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] "
"\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}},
{"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}},
{"uuid", {"\"\\\"\" [0-9a-fA-F]{8} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{4} \"-\" [0-9a-fA-F]{12} \"\\\"\" space", {}}},
{"char", {"[^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})", {}}},
{"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}},
{"null", {"\"null\" space", {}}},
};
std::unordered_map<std::string, BuiltinRule> STRING_FORMAT_RULES = {
{"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date", {"[0-9]{4} \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}},
{"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9]{3} )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}},
{"date-time", {"date \"T\" time", {"date", "time"}}},
{"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}},
{"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}},
@ -385,8 +348,7 @@ private:
sub_is_literal ? "\"" + sub + "\"" : sub,
min_times,
max_times,
"",
sub_is_literal
""
);
seq.back().second = false;
} else {

View file

@ -1052,7 +1052,7 @@ struct train_params_common get_default_train_params_common() {
params.custom_n_ctx = false;
params.use_flash = true;
params.use_flash = false;
params.use_checkpointing = true;
params.sample_start = "";

View file

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
@ -81,6 +82,8 @@ models = [
{"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
{"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
{"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
{"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct", },
{"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code", },
]

View file

@ -1,4 +1,5 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from __future__ import annotations
@ -25,8 +26,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
from convert import LlamaHfVocab
logger = logging.getLogger("hf-to-gguf")
@ -48,11 +47,12 @@ class Model:
_model_classes: dict[str, type[Model]] = {}
dir_model: Path
ftype: int
ftype: gguf.LlamaFileType
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool
lazy: bool
model_name: str | None
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
@ -65,7 +65,7 @@ class Model:
# subclasses should define this!
model_arch: gguf.MODEL_ARCH
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
@ -74,10 +74,11 @@ class Model:
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file
self.lazy = not eager
self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors")
self.model_name = model_name
self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors")
self.is_safetensors = len(self.part_names) > 0
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
@ -95,7 +96,7 @@ class Model:
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
@classmethod
def __init_subclass__(cls):
@ -183,7 +184,7 @@ class Model:
return new_name
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.block_count)
if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None:
@ -313,11 +314,10 @@ class Model:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32
block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"""{{{', '.join(str(n) for n in reversed(
(*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
)}}}"""
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
# n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
@ -326,21 +326,21 @@ class Model:
def write(self):
self.write_tensors()
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.write_tensors_to_file(progress=True)
self.gguf_writer.close()
def write_vocab(self):
self.gguf_writer.write_header_to_file()
self.gguf_writer.write_header_to_file(self.fname_out)
self.gguf_writer.write_kv_data_to_file()
self.gguf_writer.close()
@staticmethod
def get_model_part_names(dir_model: Path, suffix: str) -> list[str]:
def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]:
part_names: list[str] = []
for filename in os.listdir(dir_model):
if filename.endswith(suffix):
if filename.startswith(prefix) and filename.endswith(suffix):
part_names.append(filename)
part_names.sort()
@ -474,6 +474,12 @@ class Model:
if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
res = "jina-v2-de"
if chkhsh == "c136ed14d01c2745d4f60a9596ae66800e2b61fa45643e72436041855ad4089d":
# ref: https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct
res = "smaug-bpe"
if chkhsh == "7967bfa498ade6b757b064f31e964dddbb80f8f9a4d68d4ba7998fcf281c531a":
# ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-code
res = "jina-v2-code"
if res is None:
logger.warning("\n")
@ -632,7 +638,7 @@ class Model:
special_vocab.add_to_gguf(self.gguf_writer)
def _set_vocab_llama_hf(self):
vocab = LlamaHfVocab(self.dir_model)
vocab = gguf.LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
@ -661,7 +667,7 @@ class GPTNeoXModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -794,7 +800,7 @@ class MPTModel(Model):
def set_gguf_parameters(self):
block_count = self.hparams["n_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(self.hparams["max_seq_len"])
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
self.gguf_writer.add_block_count(block_count)
@ -846,7 +852,7 @@ class OrionModel(Model):
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_file_type(self.ftype)
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -883,7 +889,7 @@ class BaichuanModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -1006,7 +1012,7 @@ class XverseModel(Model):
else:
raise ValueError("gguf: can not find ctx length parameter.")
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_source_hf_repo(hf_repo)
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
self.gguf_writer.add_context_length(ctx_length)
@ -1202,7 +1208,7 @@ class StableLMModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -1315,6 +1321,17 @@ class LlamaModel(Model):
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "add_prefix_space" in tokenizer_config_json:
self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"])
# Apply to granite small models only
if self.hparams.get("vocab_size", 32000) == 49152:
self.gguf_writer.add_add_bos_token(False)
@staticmethod
def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
if n_head_kv is not None and n_head != n_head_kv:
@ -1329,9 +1346,9 @@ class LlamaModel(Model):
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"):
if name.endswith(("q_proj.weight", "q_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"):
if name.endswith(("k_proj.weight", "k_proj.bias")):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
@ -1666,7 +1683,7 @@ class GPT2Model(Model):
model_arch = gguf.MODEL_ARCH.GPT2
def set_gguf_parameters(self):
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_block_count(self.hparams["n_layer"])
self.gguf_writer.add_context_length(self.hparams["n_ctx"])
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
@ -2233,7 +2250,7 @@ class GemmaModel(Model):
hparams = self.hparams
block_count = hparams["num_hidden_layers"]
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(hparams["max_position_embeddings"])
self.gguf_writer.add_embedding_length(hparams["hidden_size"])
self.gguf_writer.add_block_count(block_count)
@ -2333,7 +2350,7 @@ class MambaModel(Model):
# Fail early for models which don't have a block expansion factor of 2
assert d_inner == 2 * d_model
self.gguf_writer.add_name(self.dir_model.name)
self.gguf_writer.add_name(self.dir_model.name if self.model_name is None else self.model_name)
self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
self.gguf_writer.add_embedding_length(d_model)
self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading
@ -2393,7 +2410,8 @@ class CommandR2Model(Model):
# max_position_embeddings = 8192 in config.json but model was actually
# trained on 128k context length
self.hparams["max_position_embeddings"] = self.hparams["model_max_length"]
# aya-23 models don't have model_max_length specified
self.hparams["max_position_embeddings"] = self.find_hparam(["model_max_length", "max_position_embeddings"])
def set_gguf_parameters(self):
super().set_gguf_parameters()
@ -2439,11 +2457,13 @@ class JinaBertV2Model(BertModel):
def get_tensors(self):
for name, data in super().get_tensors():
if 'gated_layers' in name:
if 'gated_layer' in name:
d1 = data[:self.intermediate_size, :]
name1 = name.replace('gated_layers', 'gated_layers_w')
name1 = name1.replace('up_gated_layer', 'gated_layers_v')
d2 = data[self.intermediate_size:, :]
name2 = name.replace('gated_layers', 'gated_layers_v')
name2 = name2.replace('up_gated_layer', 'gated_layers_w')
yield name1, d1
yield name2, d2
continue
@ -2466,6 +2486,236 @@ class JinaBertV2Model(BertModel):
self.gguf_writer.add_add_eos_token(True)
@Model.register("ArcticForCausalLM")
class ArcticModel(Model):
model_arch = gguf.MODEL_ARCH.ARCTIC
def set_vocab(self):
# The reason for using a custom implementation here is that the
# snowflake-arctic-instruct model redefined tokens 31998 and 31999 from
# tokenizer.model and used them as BOS and EOS instead of adding new tokens.
from sentencepiece import SentencePieceProcessor
tokenizer_path = self.dir_model / 'tokenizer.model'
if not tokenizer_path.is_file():
logger.error(f'Error: Missing {tokenizer_path}')
sys.exit(1)
# Read the whole vocabulary from the tokenizer.model file
tokenizer = SentencePieceProcessor()
tokenizer.LoadFromFile(str(tokenizer_path))
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
scores: list[float] = [-10000.0] * vocab_size
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
for token_id in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(token_id)
text = piece.encode("utf-8")
score = tokenizer.GetScore(token_id)
toktype = SentencePieceTokenTypes.NORMAL
if tokenizer.IsUnknown(token_id):
toktype = SentencePieceTokenTypes.UNKNOWN
elif tokenizer.IsControl(token_id):
toktype = SentencePieceTokenTypes.CONTROL
elif tokenizer.IsUnused(token_id):
toktype = SentencePieceTokenTypes.UNUSED
elif tokenizer.IsByte(token_id):
toktype = SentencePieceTokenTypes.BYTE
tokens[token_id] = text
scores[token_id] = score
toktypes[token_id] = toktype
# Use the added_tokens_decoder field from tokeniser_config.json as the source
# of information about added/redefined tokens and modify them accordingly.
tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
if tokenizer_config_file.is_file():
with open(tokenizer_config_file, "r", encoding="utf-8") as f:
tokenizer_config_json = json.load(f)
if "added_tokens_decoder" in tokenizer_config_json:
added_tokens_decoder = tokenizer_config_json["added_tokens_decoder"]
for token_id, token_json in added_tokens_decoder.items():
token_id = int(token_id)
if (token_id >= vocab_size):
logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
continue
token_content = token_json["content"]
token_type = SentencePieceTokenTypes.USER_DEFINED
token_score = -10000.0
# Map unk_token to UNKNOWN, other special tokens to CONTROL
# Set the score to 0.0 as in the original tokenizer.model
if ("special" in token_json) and token_json["special"]:
if token_content == tokenizer_config_json["unk_token"]:
token_type = SentencePieceTokenTypes.UNKNOWN
else:
token_type = SentencePieceTokenTypes.CONTROL
token_score = 0.0
logger.info(f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})")
tokens[token_id] = token_content.encode("utf-8")
toktypes[token_id] = token_type
scores[token_id] = token_score
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
special_vocab.add_to_gguf(self.gguf_writer)
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"])
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
n_head = self.hparams["num_attention_heads"]
n_kv_head = self.hparams.get("num_key_value_heads")
if name.endswith("q_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_head)
if name.endswith("k_proj.weight"):
data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
n_experts = self.hparams["num_local_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for wid in ["w1", "w2", "w3"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def write_tensors(self):
super().write_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
@Model.register("DeepseekV2ForCausalLM")
class DeepseekV2Model(Model):
model_arch = gguf.MODEL_ARCH.DEEPSEEK2
def set_vocab(self):
self._set_vocab_gpt2()
def set_gguf_parameters(self):
super().set_gguf_parameters()
hparams = self.hparams
self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"])
self.gguf_writer.add_vocab_size(hparams["vocab_size"])
if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None:
self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"])
self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"])
self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"])
self.gguf_writer.add_value_length(hparams["v_head_dim"])
self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"])
self.gguf_writer.add_expert_count(hparams["n_routed_experts"])
self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"])
self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"])
self.gguf_writer.add_rope_dimension_count(hparams["qk_rope_head_dim"])
if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]:
if self.hparams["rope_scaling"].get("type") == "yarn":
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN)
self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"])
self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"])
self.gguf_writer.add_rope_scaling_yarn_log_mul(0.1 * hparams["rope_scaling"]["mscale_all_dim"])
_experts: list[dict[str, Tensor]] | None = None
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# process the experts separately
if name.find("mlp.experts") != -1:
n_experts = self.hparams["n_routed_experts"]
assert bid is not None
if self._experts is None:
self._experts = [{} for _ in range(self.block_count)]
self._experts[bid][name] = data_torch
if len(self._experts[bid]) >= n_experts * 3:
tensors: list[tuple[str, Tensor]] = []
# merge the experts into a single 3d tensor
for w_name in ["down_proj", "gate_proj", "up_proj"]:
datas: list[Tensor] = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch = torch.stack(datas, dim=0)
merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight"
new_name = self.map_tensor_name(merged_name)
tensors.append((new_name, data_torch))
return tensors
else:
return []
return [(self.map_tensor_name(name), data_torch)]
def write_tensors(self):
super().write_tensors()
if self._experts is not None:
# flatten `list[dict[str, Tensor]]` into `list[str]`
experts = [k for d in self._experts for k in d.keys()]
if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts}")
###### CONVERSION LOGIC ######
@ -2598,8 +2848,13 @@ def main() -> None:
hparams = Model.load_hparams(dir_model)
with torch.inference_mode():
try:
model_class = Model.from_model_architecture(hparams["architectures"][0])
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)
except NotImplementedError:
logger.error(f"Model {hparams['architectures'][0]} is not supported")
sys.exit(1)
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
logger.info("Set model parameters")
model_instance.set_gguf_parameters()

View file

@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
### 1. Convert the model to GGUF
This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).
The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

View file

@ -15,7 +15,6 @@ else()
add_subdirectory(baby-llama)
add_subdirectory(batched)
add_subdirectory(batched-bench)
add_subdirectory(beam-search)
add_subdirectory(benchmark)
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)

View file

@ -1,19 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m ./models/alpaca.13b.ggmlv3.q8_0.bin \
--color \
-f ./prompts/alpaca.txt \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 7

View file

@ -522,8 +522,8 @@ static struct ggml_tensor * forward(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, 1]
// Kcur shape [n_embd/n_head, n_head, N, 1]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), KQ_pos, n_rot, 0);
// store key and value to memory
{
@ -759,8 +759,8 @@ static struct ggml_tensor * forward_batch(
// wk shape [n_embd, n_embd, 1, 1]
// Qcur shape [n_embd/n_head, n_head, N, n_batch]
// Kcur shape [n_embd/n_head, n_head, N, n_batch]
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0, 0);
struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_4d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N, n_batch), KQ_pos, n_rot, 0);
assert_shape_4d(Qcur, n_embd/n_head, n_head, N, n_batch);
assert_shape_4d(Kcur, n_embd/n_head, n_head, N, n_batch);
@ -1056,7 +1056,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wqb,
cur)),
n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0);
KQ_pos, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope(ctx0,
ggml_reshape_3d(ctx0,
ggml_mul_mat(ctx0,
@ -1065,7 +1065,7 @@ static struct ggml_tensor * forward_lora(
model->layers[il].wkb,
cur)),
n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0);
KQ_pos, n_rot, 0);
// store key and value to memory
{

View file

@ -10,16 +10,16 @@ There are 2 modes of operation:
- `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`)
```bash
./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] <PP> <TG> <PL>
./batched-bench -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]
# LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared
./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99
./batched-bench -m ./models/llama-7b/ggml-model-f16.gguf -c 16384 -b 2048 -ub 512 -ngl 99
# LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 16384 -b 2048 -ub 512 -ngl 99 -pps
# custom set of batches
./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32
./batched-bench -m ./models/llama-7b/ggml-model-q8_0.gguf -c 2048 -b 512 -ub 512 -ngl 999 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32
```
## Sample results

View file

@ -28,67 +28,27 @@ static std::vector<int> parse_list(char * p) {
return ret;
}
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -c 2048 -b 2048 -ub 512 -npp 128,256,512 -ntg 128,256 -npl 1,2,4,8,16,32 [-pps]\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
printf(" <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
int n_kv_max = 2048;
int n_batch = 2048;
int n_ubatch = 512;
bool flash_attn = false;
int is_pp_shared = 0;
int n_gpu_layers = 0;
int is_pp_shared = params.is_pp_shared;
std::vector<int> n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, };
std::vector<int> n_tg = { 128, 256, };
std::vector<int> n_pl = { 1, 2, 4, 8, 16, 32, };
//std::vector<int> n_pl = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, };
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
n_kv_max = std::atoi(argv[2]);
}
if (argc >= 4) {
n_batch = std::atoi(argv[3]);
}
if (argc >= 5) {
n_ubatch = std::atoi(argv[4]);
}
if (argc >= 6) {
flash_attn = std::atoi(argv[5]);
}
if (argc >= 7) {
is_pp_shared = std::atoi(argv[6]);
}
if (argc >= 8) {
n_gpu_layers = std::atoi(argv[7]);
}
if (argc >= 9) {
n_pp = parse_list(argv[8]);
}
if (argc >= 10) {
n_tg = parse_list(argv[9]);
}
if (argc >= 11) {
n_pl = parse_list(argv[10]);
}
std::vector<int> n_pp = params.n_pp;
std::vector<int> n_tg = params.n_tg;
std::vector<int> n_pl = params.n_pl;
// init LLM
@ -97,12 +57,7 @@ int main(int argc, char ** argv) {
// initialize the model
llama_model_params model_params = llama_model_default_params();
const std::vector<float> t_split(llama_max_devices(), 0.0f);
model_params.n_gpu_layers = n_gpu_layers;
model_params.tensor_split = t_split.data();
llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -111,16 +66,7 @@ int main(int argc, char ** argv) {
return 1;
}
llama_context_params ctx_params = llama_context_default_params();
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_max;
ctx_params.n_batch = n_batch;
ctx_params.n_ubatch = n_ubatch;
ctx_params.flash_attn = flash_attn;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
// ensure enough sequences are available
ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
@ -132,6 +78,8 @@ int main(int argc, char ** argv) {
return 1;
}
const int32_t n_kv_max = llama_n_ctx(ctx);
llama_batch batch = llama_batch_init(n_kv_max, 0, 1);
// decode in batches of ctx_params.n_batch tokens
@ -175,7 +123,7 @@ int main(int argc, char ** argv) {
}
LOG_TEE("\n");
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, params.n_batch, params.n_ubatch, params.flash_attn, params.is_pp_shared, params.n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
LOG_TEE("\n");
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");

View file

@ -3,7 +3,7 @@
The example demonstrates batched generation from a given prompt
```bash
./batched ./models/llama-7b-v2/ggml-model-f16.gguf "Hello my name is" 4
./batched -m ./models/llama-7b-v2/ggml-model-f16.gguf -p "Hello my name is" -np 4
...

View file

@ -7,48 +7,31 @@
#include <string>
#include <vector>
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s -m model.gguf -p \"Hello my name is\" -n 32 -np 4\n", argv[0]);
LOG_TEE("\n");
}
int main(int argc, char ** argv) {
gpt_params params;
if (argc == 1 || argv[1][0] == '-') {
printf("usage: %s MODEL_PATH [PROMPT] [PARALLEL] [LEN] [NGL]\n" , argv[0]);
params.prompt = "Hello my name is";
params.n_predict = 32;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
// number of parallel batches
int n_parallel = 1;
int n_parallel = params.n_parallel;
// total length of the sequences including the prompt
int n_len = 32;
// number of layers to offload to the GPU
int n_gpu_layers = 0;
if (argc >= 2) {
params.model = argv[1];
}
if (argc >= 3) {
params.prompt = argv[2];
}
if (argc >= 4) {
n_parallel = std::atoi(argv[3]);
}
if (argc >= 5) {
n_len = std::atoi(argv[4]);
}
if (argc >= 6) {
n_gpu_layers = std::atoi(argv[5]);
}
if (params.prompt.empty()) {
params.prompt = "Hello my name is";
}
string_process_escapes(params.prompt);
int n_predict = 32;
// init LLM
@ -57,9 +40,7 @@ int main(int argc, char ** argv) {
// initialize the model
llama_model_params model_params = llama_model_default_params();
model_params.n_gpu_layers = n_gpu_layers;
llama_model_params model_params = llama_model_params_from_gpt_params(params);
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
@ -73,18 +54,14 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens_list;
tokens_list = ::llama_tokenize(model, params.prompt, true);
const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size())*n_parallel;
const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;
// initialize the context
llama_context_params ctx_params = llama_context_default_params();
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
ctx_params.seed = 1234;
ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_len, n_parallel);
ctx_params.n_seq_max = n_parallel;
ctx_params.n_threads = params.n_threads;
ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
ctx_params.n_batch = std::max(n_predict, n_parallel);
llama_context * ctx = llama_new_context_with_model(model, ctx_params);
@ -95,7 +72,7 @@ int main(int argc, char ** argv) {
const int n_ctx = llama_n_ctx(ctx);
LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
LOG_TEE("\n%s: n_predict = %d, n_ctx = %d, n_batch = %u, n_parallel = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, ctx_params.n_batch, n_parallel, n_kv_req);
// make sure the KV cache is big enough to hold all the prompt and generated tokens
if (n_kv_req > n_ctx) {
@ -156,7 +133,7 @@ int main(int argc, char ** argv) {
const auto t_main_start = ggml_time_us();
while (n_cur <= n_len) {
while (n_cur <= n_predict) {
// prepare the next batch
llama_batch_clear(batch);
@ -192,7 +169,7 @@ int main(int argc, char ** argv) {
//const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
// is it an end of generation? -> mark the stream as finished
if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
i_batch[i] = -1;
LOG_TEE("\n");
if (n_parallel > 1) {

View file

@ -1,5 +0,0 @@
set(TARGET beam-search)
add_executable(${TARGET} beam-search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

View file

@ -1,188 +0,0 @@
#include "common.h"
#include "llama.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
# define NOMINMAX
#endif
#include <windows.h>
#include <signal.h>
#endif
// Used for debugging to print out beam tokens.
struct ostream_beam_view {
llama_context * ctx;
llama_beam_view beam_view;
};
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
}
return os << ')';
}
// Put here anything you want back in beam_search_callback().
struct beam_search_callback_data {
llama_context * ctx;
std::vector<llama_token> response;
};
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
// For example, eob can be flagged due to maximum token length, stop words, etc.
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]);
}
// Function matching type llama_beam_search_callback_fn_t.
// Custom callback example is called each time the beams lengths increase:
// * Show progress by printing ',' following by number of convergent beam tokens if any.
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
// This is also called when the stop condition is met.
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
// Mark beams as EOS as needed.
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
llama_beam_view& beam_view = beams_state.beam_views[i];
if (!beam_view.eob && is_at_eob(callback_data, beam_view.tokens, beam_view.n_tokens)) {
beam_view.eob = true;
}
}
printf(","); // Show progress
if (const size_t n = beams_state.common_prefix_length) {
callback_data.response.resize(callback_data.response.size() + n);
assert(0u < beams_state.n_beams);
const llama_token * tokens = beams_state.beam_views[0].tokens;
std::copy(tokens, tokens + n, callback_data.response.end() - n);
printf("%zu", n);
}
fflush(stdout);
#if 1 // DEBUG: print current beams for this iteration
std::cout << "\n\nCurrent beams (last_call=" << beams_state.last_call << "):\n";
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
std::cout << "beams["<<i<<"]: " << ostream_beam_view{callback_data.ctx,beams_state.beam_views[i]} << std::endl;
}
#endif
}
int main(int argc, char ** argv)
{
gpt_params params;
//params.n_gpu_layers = 200;
//---------------------------------
// Print help :
//---------------------------------
if ( argc < 2 || argv[1][0] == '-' )
{
printf( "Usage: %s MODEL_PATH [BEAM_WIDTH=2] [PROMPT]\n" , argv[0] );
return 1 ;
}
//---------------------------------
// Load parameters :
//---------------------------------
params.model = argv[1];
params.n_beams = 2 < argc ? std::stoi(argv[2]) : 2;
if ( argc > 3 )
{
params.prompt = argv[3];
}
if ( params.prompt.empty() )
{
params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
}
//---------------------------------
// Init LLM :
//---------------------------------
llama_backend_init();
llama_numa_init(params.numa);
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params( params );
if ( model == NULL )
{
fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
return 1;
}
//---------------------------------
// Tokenize the prompt :
//---------------------------------
std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);
const size_t max_context_size = llama_n_ctx( ctx );
const size_t max_tokens_list_size = max_context_size - 4 ;
if (tokens_list.size() > max_tokens_list_size)
{
fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
__func__ , tokens_list.size() , max_tokens_list_size );
return 1;
}
fprintf( stderr, "\n\n" );
// Print the tokens from the prompt :
for( auto id : tokens_list )
{
std::cout << llama_token_to_piece(ctx, id);
}
std::cout << std::flush;
int n_past = 0;
if (llama_decode(ctx, llama_batch_get_one(tokens_list.data(), tokens_list.size(), n_past, 0)))
{
fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
return 1;
}
n_past += tokens_list.size();
beam_search_callback_data callback_data{ctx, {}};
size_t const beam_width = static_cast<size_t>(params.n_beams);
int const n_predict = 256;
llama_beam_search(ctx, beam_search_callback, &callback_data, beam_width, n_past, n_predict);
std::cout << "\n\n";
for (llama_token const token_id : callback_data.response) {
std::cout << llama_token_to_piece(ctx,token_id);
}
std::cout << std::endl;
llama_free( ctx );
llama_free_model( model );
llama_backend_free();
return 0;
}

View file

@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional
import numpy as np
from sentencepiece import SentencePieceProcessor
if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
# use .parent.parent since we are in "examples" directory
sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))
import gguf
from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
@ -174,7 +176,7 @@ class Params:
rope_scaling_type: gguf.RopeScalingType | None = None
f_rope_freq_base: float | None = None
f_rope_scale: float | None = None
n_orig_ctx: int | None = None
n_ctx_orig: int | None = None
rope_finetuned: bool | None = None
ftype: GGMLFileType | None = None
@ -224,7 +226,7 @@ class Params:
with open(config_path) as f:
config = json.load(f)
rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None
rope_scaling_type = f_rope_scale = n_ctx_orig = rope_finetuned = None
rope_scaling = config.get("rope_scaling")
if rope_scaling is not None and (typ := rope_scaling.get("type")):
@ -234,7 +236,7 @@ class Params:
rope_scaling_type = gguf.RopeScalingType.LINEAR
elif typ == "yarn":
rope_scaling_type = gguf.RopeScalingType.YARN
n_orig_ctx = rope_scaling['original_max_position_embeddings']
n_ctx_orig = rope_scaling['original_max_position_embeddings']
rope_finetuned = rope_scaling['finetuned']
else:
raise NotImplementedError(f'Unknown rope scaling type: {typ}')
@ -270,7 +272,7 @@ class Params:
f_rope_freq_base = config.get("rope_theta"),
rope_scaling_type = rope_scaling_type,
f_rope_scale = f_rope_scale,
n_orig_ctx = n_orig_ctx,
n_ctx_orig = n_ctx_orig,
rope_finetuned = rope_finetuned,
)
@ -380,306 +382,6 @@ class Metadata:
return metadata
#
# vocab
#
@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]
name: ClassVar[str]
class NoVocab(BaseVocab):
tokenizer_model = "no_vocab"
name = "no_vocab"
def __repr__(self) -> str:
return "<NoVocab for a model without integrated vocabulary>"
@runtime_checkable
class Vocab(BaseVocab, Protocol):
vocab_size: int
added_tokens_dict: dict[str, int]
added_tokens_list: list[str]
fname_tokenizer: Path
def __init__(self, base_path: Path): ...
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
class BpeVocab(Vocab):
tokenizer_model = "gpt2"
name = "bpe"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'vocab.json').exists():
# "slow" tokenizer
with open(fname_tokenizer, encoding="utf-8") as f:
self.vocab = json.load(f)
try:
# FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
else:
# "fast" tokenizer
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding="utf-8") as f:
tokenizer_json = json.load(f)
tokenizer_model: dict[str, Any] = tokenizer_json['model']
if (
tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'ByteLevel'
):
raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
self.vocab = tokenizer_model["vocab"]
if (added := tokenizer_json.get('added_tokens')) is not None:
# Added tokens here can be duplicates of the main vocabulary.
added_tokens = {item['content']: item['id']
for item in added
if item['content'] not in self.vocab}
vocab_size = len(self.vocab)
expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
actual_ids = sorted(added_tokens.values())
if expected_ids != actual_ids:
expected_end_id = vocab_size + len(actual_ids) - 1
raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
f"{vocab_size} - {expected_end_id}; got {actual_ids}")
items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
self.added_tokens_dict = added_tokens
self.added_tokens_list = [text for (text, idx) in items]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
for i, _ in enumerate(self.vocab):
yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.bpe_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class SentencePieceVocab(Vocab):
tokenizer_model = "llama"
name = "spm"
def __init__(self, base_path: Path):
added_tokens: dict[str, int] = {}
if (fname_tokenizer := base_path / 'tokenizer.model').exists():
# normal location
try:
with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
added_tokens = json.load(f)
except FileNotFoundError:
pass
elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
actual_new_ids = sorted(new_tokens.keys())
if expected_new_ids != actual_new_ids:
raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
# Token pieces that were added to the base vocabulary.
self.added_tokens_dict = added_tokens
self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
self.vocab_size_base = vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE
yield text, score, toktype
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
score = -1000.0
yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.sentencepiece_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
class LlamaHfVocab(Vocab):
tokenizer_model = "llama"
name = "hfft"
def __init__(self, base_path: Path):
fname_tokenizer = base_path / FAST_TOKENIZER_FILE
# if this fails, FileNotFoundError propagates to caller
with open(fname_tokenizer, encoding='utf-8') as f:
tokenizer_json = json.load(f)
# pre-check so we know if we need transformers
tokenizer_model: dict[str, Any] = tokenizer_json['model']
is_llama3 = (
tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
and not tokenizer_model.get('byte_fallback', True)
)
if is_llama3:
raise TypeError('Llama 3 must be converted with BpeVocab')
if not is_llama3 and (
tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
or tokenizer_json['decoder']['type'] != 'Sequence'
):
raise FileNotFoundError('Cannot find Llama BPE tokenizer')
try:
from transformers import AutoTokenizer
except ImportError as e:
raise ImportError(
"To use LlamaHfVocab, please install the `transformers` package. "
"You can install it with `pip install transformers`."
) from e
# Allow the tokenizer to default to slow or fast versions.
# Explicitly set tokenizer to use local paths.
self.tokenizer = AutoTokenizer.from_pretrained(
base_path,
cache_dir=base_path,
local_files_only=True,
)
assert self.tokenizer.is_fast # assume tokenizer.json is used
# Initialize lists and dictionaries for added tokens
self.added_tokens_list = []
self.added_tokens_dict = dict()
self.added_tokens_ids = set()
# Process added tokens
for tok, tokidx in sorted(
self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
):
# Only consider added tokens that are not in the base vocabulary
if tokidx >= self.tokenizer.vocab_size:
self.added_tokens_list.append(tok)
self.added_tokens_dict[tok] = tokidx
self.added_tokens_ids.add(tokidx)
# Store special tokens and their IDs
self.specials = {
tok: self.tokenizer.get_vocab()[tok]
for tok in self.tokenizer.all_special_tokens
}
self.special_ids = set(self.tokenizer.all_special_ids)
# Set vocabulary sizes
self.vocab_size_base = self.tokenizer.vocab_size
self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
self.fname_tokenizer = fname_tokenizer
def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
reverse_vocab = {
id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
}
for token_id in range(self.vocab_size_base):
# Skip processing added tokens here
if token_id in self.added_tokens_ids:
continue
# Convert token text to bytes
token_text = reverse_vocab[token_id].encode("utf-8")
# Yield token text, score, and type
yield token_text, self.get_token_score(token_id), self.get_token_type(
token_id, token_text, self.special_ids # Reuse already stored special IDs
)
def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
# Special case for byte tokens
if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
return gguf.TokenType.BYTE
# Determine token type based on whether it's a special token
return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
def get_token_score(self, token_id: int) -> float:
# Placeholder for actual logic to determine the token's score
# This needs to be implemented based on specific requirements
return -1000.0 # Default score
def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
for text in self.added_tokens_list:
if text in self.specials:
toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
score = self.get_token_score(self.specials[text])
else:
toktype = gguf.TokenType.USER_DEFINED
score = -1000.0
yield text.encode("utf-8"), score, toktype
def has_newline_token(self):
return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
yield from self.hf_tokens()
yield from self.added_tokens()
def __repr__(self) -> str:
return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
#
# data loading
# TODO: reuse (probably move to gguf.py?)
@ -1162,8 +864,8 @@ class OutputFile:
self.gguf.add_rope_scaling_type(params.rope_scaling_type)
self.gguf.add_rope_scaling_factor(params.f_rope_scale)
if params.n_orig_ctx is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_orig_ctx)
if params.n_ctx_orig is not None:
self.gguf.add_rope_scaling_orig_ctx_len(params.n_ctx_orig)
if params.rope_finetuned is not None:
self.gguf.add_rope_scaling_finetuned(params.rope_finetuned)

View file

@ -774,7 +774,7 @@ static struct train_params get_default_train_params() {
params.samples_start_after_nl = false;
params.use_adam = true;
params.use_flash = true;
params.use_flash = false;
params.use_scratch = true;
// only adam

View file

@ -63,6 +63,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
@ -79,9 +80,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);

View file

@ -140,20 +140,18 @@ static bool run(llama_context * ctx, const gpt_params & params) {
}
int main(int argc, char ** argv) {
callback_data cb_data;
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
print_build_info();
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
llama_backend_init();
llama_numa_init(params.numa);

View file

@ -564,7 +564,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
const int rope_mode = 0;
return ggml_rope_ext(ctx,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx, 0,
t, KQ_pos, nullptr, n_rot, rope_mode, n_ctx,
rope_freq_base, rope_freq_scale, 0.0f, 1.0f, 0.0f, 0.0f
);
};

View file

@ -61,10 +61,10 @@ static size_t split_str_to_n_bytes(std::string str) {
int n;
if (str.back() == 'M') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024; // megabytes
n_bytes = (size_t)n * 1000 * 1000; // megabytes
} else if (str.back() == 'G') {
sscanf(str.c_str(), "%d", &n);
n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes
n_bytes = (size_t)n * 1000 * 1000 * 1000; // gigabytes
} else {
throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back()));
}
@ -284,7 +284,7 @@ struct split_strategy {
struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i));
total_size += ggml_nbytes(t);
}
total_size = total_size / 1024 / 1024; // convert to megabytes
total_size = total_size / 1000 / 1000; // convert to megabytes
printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size);
i_split++;
}

View file

@ -41,7 +41,7 @@ echo PASS
echo
# 2b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --n-predict 32
echo PASS
echo
@ -51,7 +51,7 @@ echo PASS
echo
# 3b. Test the merged model is loading properly
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-merge.gguf --n-predict 32
echo PASS
echo
@ -61,7 +61,7 @@ echo PASS
echo
# 4b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --n-predict 32
echo PASS
echo
@ -71,7 +71,7 @@ echo
#echo
# 5b. Test the merged model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32
#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --n-predict 32
#echo PASS
#echo
@ -81,7 +81,7 @@ echo PASS
echo
# 6b. Test the sharded model is loading properly
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32
$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --n-predict 32
echo PASS
echo

View file

@ -1,15 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main --color --instruct --threads 4 \
--model ./models/gpt4all-7B/gpt4all-lora-quantized.bin \
--file ./prompts/alpaca.txt \
--batch_size 8 --ctx_size 2048 -n -1 \
--repeat_last_n 64 --repeat_penalty 1.3 \
--n_predict 128 --temp 0.1 --top_k 40 --top_p 0.95

View file

@ -153,7 +153,9 @@ static std::string gritlm_instruction(const std::string & instruction) {
int main(int argc, char * argv[]) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}

View file

@ -6,16 +6,19 @@ More information is available here: https://github.com/ggerganov/llama.cpp/pull/
## Usage
```
./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
[-ofreq num_chunks] [-ow <0 or 1>] [other common params]
./imatrix \
-m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \
[--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \
[--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]
```
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
The parameters in square brackets are optional and have the following meaning:
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
* `--output-frequency` specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
* `--save-frequency` specifies how often to save a copy of the imatrix in a separate file. Default is 0 (i.e., never)
* `--process-output` specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
For faster computation, make sure to use GPU offloading via the `-ngl` argument

View file

@ -17,39 +17,37 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\nexample usage:\n");
LOG_TEE("\n %s \\\n"
" -m model.gguf -f some-text.txt [-o imatrix.dat] [--process-output] [--verbosity 1] \\\n"
" [--no-ppl] [--chunk 123] [--output-frequency 10] [--save-frequency 0] \\\n"
" [--in-file imatrix-prev-0.dat --in-file imatrix-prev-1.dat ...]\n" , argv[0]);
LOG_TEE("\n");
}
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
struct StatParams {
std::string dataset;
std::string ofile = "imatrix.dat";
int n_output_frequency = 10;
int verbosity = 1;
int keep_every = 0;
bool collect_output_weight = false;
};
class IMatrixCollector {
public:
IMatrixCollector() = default;
void set_parameters(StatParams&& params) { m_params = std::move(params); }
void set_params(gpt_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix() const;
bool load_imatrix(const char * file_name, bool add);
static bool load_imatrix(const char * file_name, std::unordered_map<std::string, Stats>& imatrix);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
StatParams m_params;
gpt_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
std::vector<char> m_ids; // the expert ids from ggml_mul_mat_id
//
void save_imatrix(const char * file_name, const char * dataset) const;
void keep_imatrix(int ncall) const;
};
// remove any prefix and suffixes from the name
@ -85,7 +83,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (t->op != GGML_OP_MUL_MAT) return false;
// why are small batches ignored (<16 tokens)?
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false;
if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
return true;
}
@ -153,16 +151,20 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
if (!std::isfinite(e.values[e_start + j])) {
fprintf(stderr, "%f detected in %s\n", e.values[e_start + j], wname.c_str());
exit(1);
}
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
@ -185,15 +187,19 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
if (!std::isfinite(e.values[j])) {
fprintf(stderr, "%f detected in %s\n", e.values[j], wname.c_str());
exit(1);
}
}
}
if (e.ncall > m_last_call) {
m_last_call = e.ncall;
if (m_last_call % m_params.n_output_frequency == 0) {
if (m_last_call % m_params.n_out_freq == 0) {
save_imatrix();
}
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
keep_imatrix(m_last_call);
if (m_params.n_save_freq > 0 && m_last_call%m_params.n_save_freq == 0) {
save_imatrix(m_last_call);
}
}
}
@ -201,33 +207,75 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
return true;
}
void IMatrixCollector::save_imatrix() const {
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str());
void IMatrixCollector::save_imatrix(int ncall) const {
auto fname = m_params.out_file;
if (fname.empty()) {
fname = "imatrix.dat";
}
void IMatrixCollector::keep_imatrix(int ncall) const {
auto file_name = m_params.ofile;
if (file_name.empty()) file_name = "imatrix.dat";
file_name += ".at_";
file_name += std::to_string(ncall);
save_imatrix(file_name.c_str(), m_params.dataset.c_str());
if (ncall > 0) {
fname += ".at_";
fname += std::to_string(ncall);
}
// avoid writing imatrix entries that do not have full data
// this can happen with MoE models where some of the experts end up not being exercised by the provided training data
int n_entries = 0;
std::vector<std::string> to_store;
bool is_first = true; // for printing
for (const auto & kv : m_stats) {
const int n_all = kv.second.counts.size();
if (n_all == 0) {
continue;
}
int n_zeros = 0;
for (const int c : kv.second.counts) {
if (c == 0) {
n_zeros++;
}
}
if (n_zeros != 0 && is_first) {
fprintf(stderr, "\n");
is_first = false;
}
if (n_zeros == n_all) {
fprintf(stderr, "%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
continue;
}
if (n_zeros > 0) {
fprintf(stderr, "%s: entry '%40s' has partial data (%.2f%%) - skipping\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
continue;
}
n_entries++;
to_store.push_back(kv.first);
}
if (to_store.size() < m_stats.size()) {
fprintf(stderr, "%s: warning: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
}
void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const {
std::ofstream out(fname, std::ios::binary);
int n_entries = m_stats.size();
out.write((const char *) &n_entries, sizeof(n_entries));
for (const auto & p : m_stats) {
int len = p.first.size();
for (const auto & name : to_store) {
const auto & stat = m_stats.at(name);
int len = name.size();
out.write((const char *) &len, sizeof(len));
out.write(p.first.c_str(), len);
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write(name.c_str(), len);
out.write((const char *) &stat.ncall, sizeof(stat.ncall));
int nval = stat.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
tmp[i] = (stat.values[i] / static_cast<float>(stat.counts[i])) * static_cast<float>(stat.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
@ -236,26 +284,28 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
// Write the number of call the matrix was computed with
out.write((const char *) &m_last_call, sizeof(m_last_call));
// Write the dataset name at the end of the file to later on specify it in quantize
int n_dataset = strlen(dataset);
out.write((const char *) &n_dataset, sizeof(n_dataset));
out.write(dataset, n_dataset);
// Write the input filename at the end of the file to later on specify it in quantize
{
int len = m_params.prompt_file.size();
out.write((const char *) &len, sizeof(len));
out.write(m_params.prompt_file.c_str(), len);
}
if (m_params.verbosity > 0) {
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname);
fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname.c_str());
}
}
bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_map<std::string, Stats>& imatrix_data) {
std::ifstream in(imatrix_file, std::ios::binary);
bool IMatrixCollector::load_imatrix(const char * fname) {
std::ifstream in(fname, std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__,imatrix_file);
printf("%s: failed to open %s\n",__func__, fname);
return false;
}
int n_entries;
in.read((char*)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file);
printf("%s: no data in file %s\n", __func__, fname);
return false;
}
for (int i = 0; i < n_entries; ++i) {
@ -263,23 +313,22 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file);
printf("%s: failed reading name for entry %d from %s\n",__func__,i+1, fname);
return false;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto& e = imatrix_data[std::move(name)];
auto & e = m_stats[std::move(name)];
int ncall;
in.read((char*)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n",__func__,i);
imatrix_data = {};
m_stats = {};
return false;
}
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
@ -289,7 +338,7 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
m_stats = {};
return false;
}
@ -304,13 +353,6 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
return true;
}
bool IMatrixCollector::load_imatrix(const char * file_name, bool add) {
if (!add) {
m_stats.clear();
}
return load_imatrix(file_name, m_stats);
}
static IMatrixCollector g_collector;
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
@ -358,8 +400,7 @@ static results_log_softmax log_softmax(int n_vocab, const float * logits, int to
static void process_logits(
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
double & nll, double & nll2, float * logit_history, float * prob_history
) {
double & nll, double & nll2, float * logit_history, float * prob_history) {
std::mutex mutex;
int counter = 0;
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
@ -391,8 +432,7 @@ static void process_logits(
}
}
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) {
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1);
const int n_ctx = llama_n_ctx(ctx);
@ -405,13 +445,13 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
auto tim2 = std::chrono::high_resolution_clock::now();
fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
if (from_chunk > 0) {
if (size_t((from_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, from_chunk);
if (params.i_chunk > 0) {
if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
fprintf(stderr, "%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
return false;
}
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, from_chunk, from_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + from_chunk*n_ctx);
fprintf(stderr, "%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
}
if (int(tokens.size()) < 2*n_ctx) {
@ -424,7 +464,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
std::vector<float> logit_history;
std::vector<float> prob_history;
if (compute_ppl) {
if (params.compute_ppl) {
logit_history.resize(tokens.size());
prob_history.resize(tokens.size());
}
@ -446,7 +486,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
std::vector<float> logits;
if (compute_ppl && num_batches > 1) {
if (params.compute_ppl && num_batches > 1) {
logits.reserve((size_t)n_ctx * n_vocab);
}
@ -482,7 +522,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
// restore the original token in case it was set to BOS
tokens[batch_start] = token_org;
if (compute_ppl && num_batches > 1) {
if (params.compute_ppl && num_batches > 1) {
const auto * batch_logits = llama_get_logits(ctx);
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
}
@ -501,7 +541,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
}
if (compute_ppl) {
if (params.compute_ppl) {
const int first = n_ctx/2;
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
@ -516,7 +556,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
printf("\n");
if (compute_ppl) {
if (params.compute_ppl) {
nll2 /= count;
nll /= count;
const double ppl = exp(nll);
@ -533,111 +573,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
}
int main(int argc, char ** argv) {
StatParams sparams;
std::string prev_result_file;
std::string combine_files;
bool compute_ppl = true;
int from_chunk = 0;
std::vector<char*> args;
args.push_back(argv[0]);
int iarg = 1;
for (; iarg < argc-1; ++iarg) {
std::string arg{argv[iarg]};
if (arg == "-o" || arg == "--output-file") {
sparams.ofile = argv[++iarg];
}
else if (arg == "-ofreq" || arg == "--output-frequency") {
sparams.n_output_frequency = std::stoi(argv[++iarg]);
}
else if (arg == "-ow" || arg == "--output-weight") {
sparams.collect_output_weight = std::stoi(argv[++iarg]);
}
else if (arg == "--verbosity") {
sparams.verbosity = std::stoi(argv[++iarg]);
} else if (arg == "--no-ppl") {
compute_ppl = false;
} else if (arg == "--keep-imatrix") {
sparams.keep_every = std::stoi(argv[++iarg]);
} else if (arg == "--continue-from") {
prev_result_file = argv[++iarg];
} else if (arg == "--combine") {
combine_files = argv[++iarg];
}
else if (arg == "--from-chunk") {
from_chunk = std::stoi(argv[++iarg]);
} else {
args.push_back(argv[iarg]);
}
}
if (iarg < argc) {
std::string arg{argv[iarg]};
if (arg == "--no-ppl") {
compute_ppl = false;
} else {
args.push_back(argv[iarg]);
}
}
gpt_params params;
params.n_batch = 512;
if (!gpt_params_parse(args.size(), args.data(), params)) {
params.n_ctx = 512;
params.logits_all = true;
params.verbosity = 1;
if (!gpt_params_parse(argc, argv, params)) {
print_usage(argc, argv, params);
return 1;
}
params.logits_all = true;
params.n_batch = std::min(params.n_batch, params.n_ctx);
print_build_info();
g_collector.set_params(params);
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
fprintf(stderr, "%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
sparams.dataset = params.prompt_file;
g_collector.set_parameters(std::move(sparams));
if (!combine_files.empty()) {
std::vector<std::string> files;
size_t pos = 0;
while (true) {
auto new_pos = combine_files.find(',', pos);
if (new_pos != std::string::npos) {
files.emplace_back(combine_files.substr(pos, new_pos - pos));
pos = new_pos + 1;
} else {
files.emplace_back(combine_files.substr(pos));
break;
}
}
if (files.size() < 2) {
fprintf(stderr, "You must provide at least two comma separated files to use --combine\n");
return 1;
}
printf("Combining the following %d files\n", int(files.size()));
for (auto& file : files) {
printf(" %s\n", file.c_str());
if (!g_collector.load_imatrix(file.c_str(), true)) {
fprintf(stderr, "Failed to load %s\n", file.c_str());
for (const auto & in_file : params.in_files) {
printf("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
if (!g_collector.load_imatrix(in_file.c_str())) {
fprintf(stderr, "%s : failed to load %s\n", __func__, in_file.c_str());
return 1;
}
}
if (params.in_files.size() > 1) {
printf("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
g_collector.save_imatrix();
return 0;
}
if (!prev_result_file.empty()) {
if (!g_collector.load_imatrix(prev_result_file.c_str(), false)) {
fprintf(stderr, "=============== Failed to load %s\n", prev_result_file.c_str());
return 1;
}
}
llama_backend_init();
@ -652,6 +613,7 @@ int main(int argc, char ** argv) {
// init
llama_model * model;
llama_context * ctx;
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr || ctx == nullptr) {
fprintf(stderr, "%s : failed to init\n", __func__);
@ -670,8 +632,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s\n", gpt_params_get_system_info(params).c_str());
}
bool OK = compute_imatrix(ctx, params, compute_ppl, from_chunk);
if (!OK) {
if (!compute_imatrix(ctx, params)) {
return 1;
}

View file

@ -107,6 +107,7 @@ int main(int argc, char ** argv) {
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
@ -139,27 +140,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;
}
if (params.instruct) {
printf("\n************\n");
printf("%s: please use the 'main' tool for instruct mode\n", __func__);
printf("************\n\n");
return 0;
}
if (params.chatml) {
printf("\n************\n");
printf("%s: please use the 'main' tool for chatml mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.antiprompt.empty()) {
printf("\n************\n");
printf("%s: please use the 'main' tool for antiprompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) {
printf("\n************\n");
printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__);
@ -167,20 +147,6 @@ int main(int argc, char ** argv) {
return 0;
}
if (params.random_prompt) {
printf("\n************\n");
printf("%s: please use the 'main' tool for random prompt mode\n", __func__);
printf("************\n\n");
return 0;
}
if (!params.path_prompt_cache.empty()) {
printf("\n************\n");
printf("%s: infill does not support prompt caching\n", __func__);
printf("************\n\n");
return 0;
}
if (params.rope_freq_base != 0.0) {
LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base);
@ -207,17 +173,13 @@ int main(int argc, char ** argv) {
llama_model * model;
llama_context * ctx;
llama_context * ctx_guidance = NULL;
g_model = &model;
g_ctx = &ctx;
// load the model and apply lora adapter, if any
LOG("%s: load the model and apply lora adapter, if any\n", __func__);
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (sparams.cfg_scale > 1.f) {
struct llama_context_params lparams = llama_context_params_from_gpt_params(params);
ctx_guidance = llama_new_context_with_model(model, lparams);
}
if (model == NULL) {
LOG_TEE("%s: error: unable to load model\n", __func__);
@ -273,25 +235,6 @@ int main(int argc, char ** argv) {
LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
}
// Tokenize negative prompt
std::vector<llama_token> guidance_inp;
int guidance_offset = 0;
int original_prompt_len = 0;
if (ctx_guidance) {
LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt));
guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true);
LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str());
std::vector<llama_token> original_inp = ::llama_tokenize(ctx, params.prompt, true);
LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str());
original_prompt_len = original_inp.size();
guidance_offset = (int)guidance_inp.size() - original_prompt_len;
LOG("original_prompt_len: %s", log_tostr(original_prompt_len));
LOG("guidance_offset: %s", log_tostr(guidance_offset));
}
if ((int) embd_inp.size() > n_ctx - 4) {
LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4);
return 1;
@ -319,15 +262,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
}
if (ctx_guidance) {
LOG_TEE("\n");
LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str());
LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size());
for (int i = 0; i < (int) guidance_inp.size(); i++) {
LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str());
}
}
if (params.n_keep > 0) {
LOG_TEE("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
@ -400,7 +334,6 @@ int main(int argc, char ** argv) {
int n_past = 0;
int n_remain = params.n_predict;
int n_consumed = 0;
int n_past_guidance = 0;
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
@ -410,7 +343,6 @@ int main(int argc, char ** argv) {
console::set_display(console::prompt);
std::vector<llama_token> embd;
std::vector<llama_token> embd_guidance;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams);
@ -436,7 +368,7 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (n_past + (int) embd.size() > n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -453,11 +385,7 @@ int main(int argc, char ** argv) {
n_past -= n_discard;
if (ctx_guidance) {
n_past_guidance -= n_discard;
}
LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance);
LOG("after swap: n_past = %d\n", n_past);
LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str());
@ -465,45 +393,6 @@ int main(int argc, char ** argv) {
// evaluate tokens in batches
// embd is typically prepared beforehand to fit within a batch, but not always
if (ctx_guidance) {
int input_size = 0;
llama_token * input_buf = NULL;
if (n_past_guidance < (int) guidance_inp.size()) {
// Guidance context should have the same data with these modifications:
//
// * Replace the initial prompt
// * Shift everything by guidance_offset
embd_guidance = guidance_inp;
if (embd.begin() + original_prompt_len < embd.end()) {
embd_guidance.insert(
embd_guidance.end(),
embd.begin() + original_prompt_len,
embd.end()
);
}
input_buf = embd_guidance.data();
input_size = embd_guidance.size();
LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str());
} else {
input_buf = embd.data();
input_size = embd.size();
}
for (int i = 0; i < input_size; i += params.n_batch) {
int n_eval = std::min(input_size - i, params.n_batch);
if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) {
LOG_TEE("%s : failed to eval\n", __func__);
return 1;
}
n_past_guidance += n_eval;
}
}
for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
int n_eval = (int) embd.size() - i;
if (n_eval > params.n_batch) {
@ -525,11 +414,9 @@ int main(int argc, char ** argv) {
}
embd.clear();
embd_guidance.clear();
if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance);
const llama_token id = llama_sampling_sample(ctx_sampling, ctx, nullptr);
llama_sampling_accept(ctx_sampling, ctx, id, true);
@ -583,7 +470,6 @@ int main(int argc, char ** argv) {
// if not currently processing queued inputs;
if ((int) embd_inp.size() <= n_consumed) {
// deal with eot token in infill mode
if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){
if (is_interacting && !params.interactive_first) {
@ -644,7 +530,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
embd_inp.push_back(llama_token_middle(model));
embd.clear();
embd_guidance.clear();
n_remain = params.n_predict;
n_past = 0;
n_consumed = 0;
@ -751,7 +636,6 @@ int main(int argc, char ** argv) {
llama_print_timings(ctx);
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
if (ctx_guidance) { llama_free(ctx_guidance); }
llama_free(ctx);
llama_free_model(model);

View file

@ -6,52 +6,22 @@ import re
import sys
from typing import Any, Dict, List, Set, Tuple, Union
def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False):
if not separator_rule:
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
if min_items == 0 and max_items == 1:
return f'{item_rule}?'
elif min_items == 1 and max_items is None:
if not separator_rule:
if min_items == 1 and max_items is None:
return f'{item_rule}+'
result = ''
if min_items > 0:
if item_rule_is_literal and separator_rule is None:
result = '"' + (item_rule[1:-1] * min_items) + '"'
elif min_items == 0 and max_items is None:
return f'{item_rule}*'
else:
result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items)
return f'{item_rule}{{{min_items},{max_items if max_items is not None else ""}}}'
def opt_repetitions(up_to_n, prefix_with_sep=False):
'''
- n=4, no sep: '(a (a (a (a)?)?)?)?'
- n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?'
- n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?'
'''
content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule
if up_to_n == 0:
return ''
elif up_to_n == 1:
return f'({content})?'
elif separator_rule and not prefix_with_sep:
return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?'
else:
return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n)
if min_items > 0 and max_items != min_items:
result += ' '
if max_items is not None:
result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0)
else:
item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})'
if min_items == 0 and separator_rule:
result = f'({item_rule} {item_operator}*)?'
else:
result += f'{item_operator}*'
return result
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
return f'({result})?' if min_items == 0 else result
class BuiltinRule:
@ -59,31 +29,28 @@ class BuiltinRule:
self.content = content
self.deps = deps or []
_up_to_15_digits = _build_repetition('[0-9]', 0, 15)
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'
# Constraining spaces to prevent model "running away".
SPACE_RULE = '| " " | "\\n" [ \\t]{0,20}'
PRIMITIVE_RULES = {
'boolean' : BuiltinRule('("true" | "false") space', []),
'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []),
'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []),
'decimal-part' : BuiltinRule('[0-9]{1,16}', []),
'integral-part': BuiltinRule('[0] | [1-9] [0-9]{0,15}', []),
'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']),
'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']),
'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']),
'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']),
'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']),
'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []),
'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []),
'uuid' : BuiltinRule(r'"\"" [0-9a-fA-F]{8} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{4} "-" [0-9a-fA-F]{12} "\"" space', []),
'char' : BuiltinRule(r'[^"\\\x7F\x00-\x1F] | [\\] (["\\bfnrt] | "u" [0-9a-fA-F]{4})', []),
'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']),
'null' : BuiltinRule('"null" space', []),
}
# TODO: support "uri", "email" string formats
STRING_FORMAT_RULES = {
'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date' : BuiltinRule('[0-9]{4} "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []),
'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9]{3} )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []),
'date-time' : BuiltinRule('date "T" time', ['date', 'time']),
'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']),
'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']),
@ -333,7 +300,7 @@ class SchemaConverter:
sub_rule_ids[sub] = id
sub = id
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False)
seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times), False)
else:
literal = ''
while i < length:

View file

@ -162,7 +162,7 @@ $ ./llama-bench -o csv
```
```csv
build_commit,build_number,cuda,opencl,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961"
"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342"
```
@ -179,7 +179,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@ -210,7 +209,6 @@ $ ./llama-bench -o json
"build_commit": "3469684",
"build_number": 1275,
"cuda": true,
"opencl": false,
"metal": false,
"gpu_blas": true,
"blas": true,
@ -253,7 +251,6 @@ CREATE TABLE IF NOT EXISTS test (
build_commit TEXT,
build_number INTEGER,
cuda INTEGER,
opencl INTEGER,
metal INTEGER,
gpu_blas INTEGER,
blas INTEGER,
@ -279,6 +276,6 @@ CREATE TABLE IF NOT EXISTS test (
stddev_ts REAL
);
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, opencl, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634');
INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692');
```

View file

@ -41,20 +41,6 @@ static std::string join(const std::vector<T> & values, const std::string & delim
return str.str();
}
template<class T>
static std::vector<T> split(const std::string & str, char delim) {
std::vector<T> values;
std::istringstream str_stream(str);
std::string token;
while (std::getline(str_stream, token, delim)) {
T value;
std::istringstream token_stream(token);
token_stream >> value;
values.push_back(value);
}
return values;
}
template<typename T, typename F>
static std::vector<std::string> transform_to_str(const std::vector<T> & values, F f) {
std::vector<std::string> str_values;
@ -140,10 +126,11 @@ static std::string get_gpu_info() {
}
// command line params
enum output_formats {CSV, JSON, MARKDOWN, SQL};
enum output_formats {NONE, CSV, JSON, MARKDOWN, SQL};
static const char * output_format_str(output_formats format) {
switch (format) {
case NONE: return "none";
case CSV: return "csv";
case JSON: return "json";
case MARKDOWN: return "md";
@ -152,6 +139,23 @@ static const char * output_format_str(output_formats format) {
}
}
static bool output_format_from_str(const std::string & s, output_formats & format) {
if (s == "none") {
format = NONE;
} else if (s == "csv") {
format = CSV;
} else if (s == "json") {
format = JSON;
} else if (s == "md") {
format = MARKDOWN;
} else if (s == "sql") {
format = SQL;
} else {
return false;
}
return true;
}
static const char * split_mode_str(llama_split_mode mode) {
switch (mode) {
case LLAMA_SPLIT_MODE_NONE: return "none";
@ -178,6 +182,7 @@ struct cmd_params {
std::vector<ggml_type> type_v;
std::vector<int> n_threads;
std::vector<int> n_gpu_layers;
std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
@ -189,6 +194,7 @@ struct cmd_params {
int reps;
bool verbose;
output_formats output_format;
output_formats output_format_stderr;
};
static const cmd_params cmd_params_defaults = {
@ -202,6 +208,7 @@ static const cmd_params cmd_params_defaults = {
/* type_v */ {GGML_TYPE_F16},
/* n_threads */ {cpu_get_num_math()},
/* n_gpu_layers */ {99},
/* rpc_servers */ {""},
/* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
/* main_gpu */ {0},
/* no_kv_offload */ {false},
@ -212,7 +219,8 @@ static const cmd_params cmd_params_defaults = {
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
/* output_format */ MARKDOWN,
/* output_format_stderr */ NONE,
};
static void print_usage(int /* argc */, char ** argv) {
@ -230,6 +238,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ctv, --cache-type-v <t> (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
printf(" -t, --threads <n> (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@ -240,6 +249,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
printf(" -o, --output <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|md|sql> (default: %s)\n", output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf("\n");
printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n");
@ -281,6 +291,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
params.verbose = cmd_params_defaults.verbose;
params.output_format = cmd_params_defaults.output_format;
params.output_format_stderr = cmd_params_defaults.output_format_stderr;
params.reps = cmd_params_defaults.reps;
for (int i = 1; i < argc; i++) {
@ -297,28 +308,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], ',');
auto p = string_split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
@ -329,21 +340,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@ -359,7 +370,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
@ -375,21 +386,27 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<int>(argv[i], split_delim);
auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (arg == "-rpc" || arg == "--rpc") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<std::string>(argv[i], split_delim);
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
@ -411,13 +428,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
params.main_gpu = split<int>(argv[i], split_delim);
params.main_gpu = string_split<int>(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
@ -435,28 +452,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim);
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
} else if (arg == "-mmp" || arg == "--mmap") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim);
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
} else if (arg == "-embd" || arg == "--embeddings") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim);
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
} else if (arg == "-ts" || arg == "--tensor-split") {
if (++i >= argc) {
invalid_param = true;
break;
}
for (auto ts : split<std::string>(argv[i], split_delim)) {
for (auto ts : string_split<std::string>(argv[i], split_delim)) {
// split string by ; and /
const std::regex regex{R"([;/]+)"};
std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
@ -484,18 +501,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true;
break;
}
if (argv[i] == std::string("csv")) {
params.output_format = CSV;
} else if (argv[i] == std::string("json")) {
params.output_format = JSON;
} else if (argv[i] == std::string("md")) {
params.output_format = MARKDOWN;
} else if (argv[i] == std::string("sql")) {
params.output_format = SQL;
} else {
invalid_param = !output_format_from_str(argv[i], params.output_format);
} else if (arg == "-oe" || arg == "--output-err") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else {
@ -519,6 +531,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; }
if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; }
if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; }
if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; }
if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
@ -541,6 +554,7 @@ struct cmd_params_instance {
ggml_type type_v;
int n_threads;
int n_gpu_layers;
std::string rpc_servers;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
@ -553,6 +567,9 @@ struct cmd_params_instance {
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
if (!rpc_servers.empty()) {
mparams.rpc_servers = rpc_servers.c_str();
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
mparams.tensor_split = tensor_split.data();
@ -564,6 +581,7 @@ struct cmd_params_instance {
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model &&
n_gpu_layers == other.n_gpu_layers &&
rpc_servers == other.rpc_servers &&
split_mode == other.split_mode &&
main_gpu == other.main_gpu &&
use_mmap == other.use_mmap &&
@ -592,6 +610,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
// this ordering minimizes the number of times that each model needs to be reloaded
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
for (const auto & ts : params.tensor_split)
@ -618,6 +637,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@ -643,6 +663,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@ -668,6 +689,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
/* .type_v = */ tv,
/* .n_threads = */ nt,
/* .n_gpu_layers = */ nl,
/* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
@ -687,11 +709,11 @@ struct test {
static const std::string build_commit;
static const int build_number;
static const bool cuda;
static const bool opencl;
static const bool vulkan;
static const bool kompute;
static const bool metal;
static const bool sycl;
static const bool rpc;
static const bool gpu_blas;
static const bool blas;
static const std::string cpu_info;
@ -775,9 +797,6 @@ struct test {
if (cuda) {
return GGML_CUDA_NAME;
}
if (opencl) {
return "OpenCL";
}
if (vulkan) {
return "Vulkan";
}
@ -790,6 +809,9 @@ struct test {
if (sycl) {
return GGML_SYCL_NAME;
}
if (rpc) {
return "RPC";
}
if (gpu_blas) {
return "GPU BLAS";
}
@ -803,7 +825,7 @@ struct test {
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
"build_commit", "build_number",
"cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas",
"cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
"cpu_info", "gpu_info",
"model_filename", "model_type", "model_size", "model_n_params",
"n_batch", "n_ubatch",
@ -829,7 +851,7 @@ struct test {
field == "avg_ns" || field == "stddev_ns") {
return INT;
}
if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
return BOOL;
@ -858,8 +880,8 @@ struct test {
}
std::vector<std::string> values = {
build_commit, std::to_string(build_number),
std::to_string(cuda), std::to_string(opencl), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(gpu_blas), std::to_string(blas),
std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
std::to_string(metal), std::to_string(sycl), std::to_string(rpc), std::to_string(gpu_blas), std::to_string(blas),
cpu_info, gpu_info,
model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
std::to_string(n_batch), std::to_string(n_ubatch),
@ -887,13 +909,13 @@ struct test {
const std::string test::build_commit = LLAMA_COMMIT;
const int test::build_number = LLAMA_BUILD_NUMBER;
const bool test::cuda = !!ggml_cpu_has_cuda();
const bool test::opencl = !!ggml_cpu_has_clblast();
const bool test::vulkan = !!ggml_cpu_has_vulkan();
const bool test::kompute = !!ggml_cpu_has_kompute();
const bool test::metal = !!ggml_cpu_has_metal();
const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
const bool test::blas = !!ggml_cpu_has_blas();
const bool test::sycl = !!ggml_cpu_has_sycl();
const bool test::rpc = !!ggml_cpu_has_rpc();
const std::string test::cpu_info = get_cpu_info();
const std::string test::gpu_info = get_gpu_info();
@ -1011,6 +1033,27 @@ struct markdown_printer : public printer {
if (field == "n_gpu_layers") {
return 3;
}
if (field == "n_threads") {
return 7;
}
if (field == "n_batch") {
return 7;
}
if (field == "n_ubatch") {
return 8;
}
if (field == "type_k" || field == "type_v") {
return 6;
}
if (field == "split_mode") {
return 5;
}
if (field == "flash_attn") {
return 2;
}
if (field == "use_mmap") {
return 4;
}
if (field == "test") {
return 13;
}
@ -1254,6 +1297,22 @@ static void llama_null_log_callback(enum ggml_log_level level, const char * text
(void) user_data;
}
static std::unique_ptr<printer> create_printer(output_formats format) {
switch (format) {
case NONE:
return nullptr;
case CSV:
return std::unique_ptr<printer>(new csv_printer());
case JSON:
return std::unique_ptr<printer>(new json_printer());
case MARKDOWN:
return std::unique_ptr<printer>(new markdown_printer());
case SQL:
return std::unique_ptr<printer>(new sql_printer());
}
GGML_ASSERT(false);
}
int main(int argc, char ** argv) {
// try to set locale for unicode characters in markdown
setlocale(LC_CTYPE, ".UTF-8");
@ -1280,26 +1339,18 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);
// initialize printer
std::unique_ptr<printer> p;
switch (params.output_format) {
case CSV:
p.reset(new csv_printer());
break;
case JSON:
p.reset(new json_printer());
break;
case MARKDOWN:
p.reset(new markdown_printer());
break;
case SQL:
p.reset(new sql_printer());
break;
default:
assert(false);
exit(1);
}
std::unique_ptr<printer> p = create_printer(params.output_format);
std::unique_ptr<printer> p_err = create_printer(params.output_format_stderr);
if (p) {
p->fout = stdout;
p->print_header(params);
}
if (p_err) {
p_err->fout = stderr;
p_err->print_header(params);
}
std::vector<cmd_params_instance> params_instances = get_cmd_params_instances(params);
@ -1357,7 +1408,15 @@ int main(int argc, char ** argv) {
t.samples_ns.push_back(t_ns);
}
if (p) {
p->print_test(t);
fflush(p->fout);
}
if (p_err) {
p_err->print_test(t);
fflush(p_err->fout);
}
llama_print_timings(ctx);
@ -1366,7 +1425,13 @@ int main(int argc, char ** argv) {
llama_free_model(lmodel);
if (p) {
p->print_footer();
}
if (p_err) {
p_err->print_footer();
}
llama_backend_free();

View file

@ -7,8 +7,6 @@ android {
namespace = "com.example.llama"
compileSdk = 34
ndkVersion = "26.1.10909125"
defaultConfig {
applicationId = "com.example.llama"
minSdk = 33
@ -20,17 +18,6 @@ android {
vectorDrawables {
useSupportLibrary = true
}
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
}
}
}
buildTypes {
@ -55,17 +42,6 @@ android {
composeOptions {
kotlinCompilerExtensionVersion = "1.5.1"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
externalNativeBuild {
cmake {
path = file("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
}
dependencies {
@ -78,6 +54,7 @@ dependencies {
implementation("androidx.compose.ui:ui-graphics")
implementation("androidx.compose.ui:ui-tooling-preview")
implementation("androidx.compose.material3:material3")
implementation(project(":llama"))
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")

View file

@ -1,5 +1,6 @@
package com.example.llama
import android.llama.cpp.LLamaAndroid
import android.util.Log
import androidx.compose.runtime.getValue
import androidx.compose.runtime.mutableStateOf
@ -9,7 +10,7 @@ import androidx.lifecycle.viewModelScope
import kotlinx.coroutines.flow.catch
import kotlinx.coroutines.launch
class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
class MainViewModel(private val llamaAndroid: LLamaAndroid = LLamaAndroid.instance()): ViewModel() {
companion object {
@JvmStatic
private val NanosPerSecond = 1_000_000_000.0
@ -28,7 +29,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
llm.unload()
llamaAndroid.unload()
} catch (exc: IllegalStateException) {
messages += exc.message!!
}
@ -44,7 +45,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
messages += ""
viewModelScope.launch {
llm.send(text)
llamaAndroid.send(text)
.catch {
Log.e(tag, "send() failed", it)
messages += it.message!!
@ -57,7 +58,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
viewModelScope.launch {
try {
val start = System.nanoTime()
val warmupResult = llm.bench(pp, tg, pl, nr)
val warmupResult = llamaAndroid.bench(pp, tg, pl, nr)
val end = System.nanoTime()
messages += warmupResult
@ -70,7 +71,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
return@launch
}
messages += llm.bench(512, 128, 1, 3)
messages += llamaAndroid.bench(512, 128, 1, 3)
} catch (exc: IllegalStateException) {
Log.e(tag, "bench() failed", exc)
messages += exc.message!!
@ -81,7 +82,7 @@ class MainViewModel(private val llm: Llm = Llm.instance()): ViewModel() {
fun load(pathToModel: String) {
viewModelScope.launch {
try {
llm.load(pathToModel)
llamaAndroid.load(pathToModel)
messages += "Loaded $pathToModel"
} catch (exc: IllegalStateException) {
Log.e(tag, "load() failed", exc)

View file

@ -2,4 +2,5 @@
plugins {
id("com.android.application") version "8.2.0" apply false
id("org.jetbrains.kotlin.android") version "1.9.0" apply false
id("com.android.library") version "8.2.0" apply false
}

View file

@ -0,0 +1 @@
/build

View file

@ -0,0 +1,68 @@
plugins {
id("com.android.library")
id("org.jetbrains.kotlin.android")
}
android {
namespace = "android.llama.cpp"
compileSdk = 34
defaultConfig {
minSdk = 33
testInstrumentationRunner = "androidx.test.runner.AndroidJUnitRunner"
consumerProguardFiles("consumer-rules.pro")
ndk {
// Add NDK properties if wanted, e.g.
// abiFilters += listOf("arm64-v8a")
}
externalNativeBuild {
cmake {
arguments += "-DCMAKE_BUILD_TYPE=Release"
cppFlags += listOf()
arguments += listOf()
cppFlags("")
}
}
}
buildTypes {
release {
isMinifyEnabled = false
proguardFiles(
getDefaultProguardFile("proguard-android-optimize.txt"),
"proguard-rules.pro"
)
}
}
externalNativeBuild {
cmake {
path("src/main/cpp/CMakeLists.txt")
version = "3.22.1"
}
}
compileOptions {
sourceCompatibility = JavaVersion.VERSION_1_8
targetCompatibility = JavaVersion.VERSION_1_8
}
kotlinOptions {
jvmTarget = "1.8"
}
packaging {
resources {
excludes += "/META-INF/{AL2.0,LGPL2.1}"
}
}
}
dependencies {
implementation("androidx.core:core-ktx:1.12.0")
implementation("androidx.appcompat:appcompat:1.6.1")
implementation("com.google.android.material:material:1.11.0")
testImplementation("junit:junit:4.13.2")
androidTestImplementation("androidx.test.ext:junit:1.1.5")
androidTestImplementation("androidx.test.espresso:espresso-core:3.5.1")
}

View file

@ -0,0 +1,21 @@
# Add project specific ProGuard rules here.
# You can control the set of applied configuration files using the
# proguardFiles setting in build.gradle.
#
# For more details, see
# http://developer.android.com/guide/developing/tools/proguard.html
# If your project uses WebView with JS, uncomment the following
# and specify the fully qualified class name to the JavaScript interface
# class:
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
# public *;
#}
# Uncomment this to preserve the line number information for
# debugging stack traces.
#-keepattributes SourceFile,LineNumberTable
# If you keep the line number information, uncomment this to
# hide the original source file name.
#-renamesourcefileattribute SourceFile

View file

@ -0,0 +1,24 @@
package android.llama.cpp
import androidx.test.platform.app.InstrumentationRegistry
import androidx.test.ext.junit.runners.AndroidJUnit4
import org.junit.Test
import org.junit.runner.RunWith
import org.junit.Assert.*
/**
* Instrumented test, which will execute on an Android device.
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
@RunWith(AndroidJUnit4::class)
class ExampleInstrumentedTest {
@Test
fun useAppContext() {
// Context of the app under test.
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
assertEquals("android.llama.cpp.test", appContext.packageName)
}
}

View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<manifest xmlns:android="http://schemas.android.com/apk/res/android">
</manifest>

View file

@ -0,0 +1,49 @@
# For more information about using CMake with Android Studio, read the
# documentation: https://d.android.com/studio/projects/add-native-code.html.
# For more examples on how to use CMake, see https://github.com/android/ndk-samples.
# Sets the minimum CMake version required for this project.
cmake_minimum_required(VERSION 3.22.1)
# Declares the project name. The project name can be accessed via ${ PROJECT_NAME},
# Since this is the top level CMakeLists.txt, the project name is also accessible
# with ${CMAKE_PROJECT_NAME} (both CMake variables are in-sync within the top level
# build script scope).
project("llama-android")
include(FetchContent)
FetchContent_Declare(
llama
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
GIT_TAG master
)
# Also provides "common"
FetchContent_MakeAvailable(llama)
# Creates and names a library, sets it as either STATIC
# or SHARED, and provides the relative paths to its source code.
# You can define multiple libraries, and CMake builds them for you.
# Gradle automatically packages shared libraries with your APK.
#
# In this top level CMakeLists.txt, ${CMAKE_PROJECT_NAME} is used to define
# the target library name; in the sub-module's CMakeLists.txt, ${PROJECT_NAME}
# is preferred for the same purpose.
#
# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
# for GameActivity/NativeActivity derived applications, the same library name must be
# used in the AndroidManifest.xml file.
add_library(${CMAKE_PROJECT_NAME} SHARED
# List C/C++ source files with relative paths to this CMakeLists.txt.
llama-android.cpp)
# Specifies libraries CMake should link to your target library. You
# can link libraries from various origins, such as libraries defined in this
# build script, prebuilt third-party libraries, or Android system libraries.
target_link_libraries(${CMAKE_PROJECT_NAME}
# List libraries link to the target library
llama
common
android
log)

View file

@ -81,7 +81,7 @@ static void log_callback(ggml_log_level level, const char * fmt, void * data) {
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
Java_android_llama_cpp_LLamaAndroid_load_1model(JNIEnv *env, jobject, jstring filename) {
llama_model_params model_params = llama_model_default_params();
auto path_to_model = env->GetStringUTFChars(filename, 0);
@ -101,13 +101,13 @@ Java_com_example_llama_Llm_load_1model(JNIEnv *env, jobject, jstring filename) {
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1model(JNIEnv *, jobject, jlong model) {
Java_android_llama_cpp_LLamaAndroid_free_1model(JNIEnv *, jobject, jlong model) {
llama_free_model(reinterpret_cast<llama_model *>(model));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
Java_android_llama_cpp_LLamaAndroid_new_1context(JNIEnv *env, jobject, jlong jmodel) {
auto model = reinterpret_cast<llama_model *>(jmodel);
if (!model) {
@ -139,25 +139,25 @@ Java_com_example_llama_Llm_new_1context(JNIEnv *env, jobject, jlong jmodel) {
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1context(JNIEnv *, jobject, jlong context) {
Java_android_llama_cpp_LLamaAndroid_free_1context(JNIEnv *, jobject, jlong context) {
llama_free(reinterpret_cast<llama_context *>(context));
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1free(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_backend_1free(JNIEnv *, jobject) {
llama_backend_free();
}
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_log_1to_1android(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_log_1to_1android(JNIEnv *, jobject) {
llama_log_set(log_callback, NULL);
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_bench_1model(
Java_android_llama_cpp_LLamaAndroid_bench_1model(
JNIEnv *env,
jobject,
jlong context_pointer,
@ -271,13 +271,13 @@ Java_com_example_llama_Llm_bench_1model(
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
Java_android_llama_cpp_LLamaAndroid_free_1batch(JNIEnv *, jobject, jlong batch_pointer) {
llama_batch_free(*reinterpret_cast<llama_batch *>(batch_pointer));
}
extern "C"
JNIEXPORT jlong JNICALL
Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
Java_android_llama_cpp_LLamaAndroid_new_1batch(JNIEnv *, jobject, jint n_tokens, jint embd, jint n_seq_max) {
// Source: Copy of llama.cpp:llama_batch_init but heap-allocated.
@ -313,19 +313,19 @@ Java_com_example_llama_Llm_new_1batch(JNIEnv *, jobject, jint n_tokens, jint emb
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_backend_1init(JNIEnv *, jobject) {
Java_android_llama_cpp_LLamaAndroid_backend_1init(JNIEnv *, jobject) {
llama_backend_init();
}
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_system_1info(JNIEnv *env, jobject) {
Java_android_llama_cpp_LLamaAndroid_system_1info(JNIEnv *env, jobject) {
return env->NewStringUTF(llama_print_system_info());
}
extern "C"
JNIEXPORT jint JNICALL
Java_com_example_llama_Llm_completion_1init(
Java_android_llama_cpp_LLamaAndroid_completion_1init(
JNIEnv *env,
jobject,
jlong context_pointer,
@ -376,7 +376,7 @@ Java_com_example_llama_Llm_completion_1init(
extern "C"
JNIEXPORT jstring JNICALL
Java_com_example_llama_Llm_completion_1loop(
Java_android_llama_cpp_LLamaAndroid_completion_1loop(
JNIEnv * env,
jobject,
jlong context_pointer,
@ -438,6 +438,6 @@ Java_com_example_llama_Llm_completion_1loop(
extern "C"
JNIEXPORT void JNICALL
Java_com_example_llama_Llm_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
}

View file

@ -1,4 +1,4 @@
package com.example.llama
package android.llama.cpp
import android.util.Log
import kotlinx.coroutines.CoroutineDispatcher
@ -10,7 +10,7 @@ import kotlinx.coroutines.withContext
import java.util.concurrent.Executors
import kotlin.concurrent.thread
class Llm {
class LLamaAndroid {
private val tag: String? = this::class.simpleName
private val threadLocalState: ThreadLocal<State> = ThreadLocal.withInitial { State.Idle }
@ -165,8 +165,8 @@ class Llm {
}
// Enforce only one instance of Llm.
private val _instance: Llm = Llm()
private val _instance: LLamaAndroid = LLamaAndroid()
fun instance(): Llm = _instance
fun instance(): LLamaAndroid = _instance
}
}

View file

@ -0,0 +1,17 @@
package android.llama.cpp
import org.junit.Test
import org.junit.Assert.*
/**
* Example local unit test, which will execute on the development machine (host).
*
* See [testing documentation](http://d.android.com/tools/testing).
*/
class ExampleUnitTest {
@Test
fun addition_isCorrect() {
assertEquals(4, 2 + 2)
}
}

View file

@ -15,3 +15,4 @@ dependencyResolutionManagement {
rootProject.name = "LlamaAndroid"
include(":app")
include(":llama")

View file

@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/13B/llama-2-13b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

View file

@ -1,18 +0,0 @@
#!/bin/bash
#
# Temporary script - will be removed in the future
#
cd `dirname $0`
cd ..
./main -m models/available/Llama2/7B/llama-2-7b.ggmlv3.q4_0.bin \
--color \
--ctx_size 2048 \
-n -1 \
-ins -b 256 \
--top_k 10000 \
--temp 0.2 \
--repeat_penalty 1.1 \
-t 8

View file

@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
--projector-type ldpv2
```
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py path/to/MobileVLM-1.7B
python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
```
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`

View file

@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
```
5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:
```sh
python ./convert.py ../llava-v1.5-7b --skip-unknown
python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto
6) Then convert the model to gguf format:
```console
python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
```
7) And finally we can run the llava-cli using the 1.6 model version:

View file

@ -68,7 +68,7 @@ CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

View file

@ -112,9 +112,12 @@ struct llava_context {
struct llama_model * model = NULL;
};
static void show_additional_info(int /*argc*/, char ** argv) {
LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
static void print_usage(int argc, char ** argv, const gpt_params & params) {
gpt_params_print_usage(argc, argv, params);
LOG_TEE("\n example usage:\n");
LOG_TEE("\n %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
LOG_TEE("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
}
static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
@ -278,7 +281,7 @@ int main(int argc, char ** argv) {
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
show_additional_info(argc, argv);
print_usage(argc, argv, params);
return 1;
}
@ -290,8 +293,7 @@ int main(int argc, char ** argv) {
#endif // LOG_DISABLE_LOGS
if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
gpt_params_print_usage(argc, argv, params);
show_additional_info(argc, argv);
print_usage(argc, argv, {});
return 1;
}
auto model = llava_init(&params);

View file

@ -1,3 +1,3 @@
-r ../../requirements/requirements-convert.txt
-r ../../requirements/requirements-convert-legacy-llama.txt
pillow~=10.2.0
torch~=2.1.1

View file

@ -37,7 +37,8 @@ struct ngram_container {
int main(int argc, char ** argv) {
gpt_params params;
if (gpt_params_parse(argc, argv, params) == false) {
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}

View file

@ -14,8 +14,10 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
// init llama.cpp
llama_backend_init();
llama_numa_init(params.numa);

View file

@ -16,6 +16,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}

View file

@ -15,6 +15,7 @@ int main(int argc, char ** argv){
gpt_params params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}

View file

@ -8,16 +8,14 @@ Because this example is "outside of the source tree", it is important to first b
### Considerations
When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_.
When hardware acceleration libraries are used (e.g. CUDA, Metal, etc.), CMake must be able to locate the associated CMake package.
### Build llama.cpp and install to C:\LlamaCPP directory
In this case, CLBlast was already installed so the CMake package is referenced in `CMAKE_PREFIX_PATH`.
```cmd
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64
cmake -B build -DBUILD_SHARED_LIBS=OFF -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/LlamaCPP
```
@ -27,7 +25,7 @@ cmake --install build --prefix C:/LlamaCPP
```cmd
cd ..\examples\main-cmake-pkg
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64
cmake --build build --config Release
cmake --install build --prefix C:/MyLlamaApp
```

View file

@ -53,13 +53,13 @@ The following command generates "infinite" text from a starting prompt (you can
#### Unix-based systems (Linux, macOS, etc.):
```bash
./main -m models/7B/ggml-model.bin --ignore-eos -n -1 --random-prompt
./main -m models/7B/ggml-model.bin --ignore-eos -n -1
```
#### Windows:
```powershell
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1
```
## Common Options
@ -69,7 +69,6 @@ In this section, we cover the most commonly used options for running the `main`
- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
- `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
- `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
- `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text.
- `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
@ -80,11 +79,10 @@ The `main` program provides several ways to interact with the LLaMA models using
- `--prompt PROMPT`: Provide a prompt directly as a command-line option.
- `--file FNAME`: Provide a file containing a prompt or multiple prompts.
- `--interactive-first`: Run the program in interactive mode and wait for input right away. (More on this below.)
- `--random-prompt`: Start with a randomized prompt.
## Interaction
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive`, `--interactive-first`, and `--instruct`.
The `main` program offers a seamless way to interact with LLaMA models, allowing users to engage in real-time conversations or provide instructions for specific tasks. The interactive mode can be triggered using various options, including `--interactive` and `--interactive-first`.
In interactive mode, users can participate in text generation by injecting their input during the process. Users can press `Ctrl+C` at any time to interject and type their input, followed by pressing `Return` to submit it to the LLaMA model. To submit additional lines without finalizing input, users can end the current line with a backslash (`\`) and continue typing.
@ -92,7 +90,6 @@ In interactive mode, users can participate in text generation by injecting their
- `-i, --interactive`: Run the program in interactive mode, allowing users to engage in real-time conversations or provide specific instructions to the model.
- `--interactive-first`: Run the program in interactive mode and immediately wait for user input before starting the text generation.
- `-ins, --instruct`: Run the program in instruction mode, which is specifically designed to work with Alpaca models that excel in completing tasks based on user instructions.
- `--color`: Enable colorized output to differentiate visually distinguishing between prompts, user input, and generated text.
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
@ -121,16 +118,6 @@ The `--in-suffix` flag is used to add a suffix after your input. This is useful
./main -r "User:" --in-prefix " " --in-suffix "Assistant:"
```
### Instruction Mode
Instruction mode is particularly useful when working with Alpaca models, which are designed to follow user instructions for specific tasks:
- `-ins, --instruct`: Enable instruction mode to leverage the capabilities of Alpaca models in completing tasks based on user-provided instructions.
Technical detail: the user's input is internally prefixed with the reverse prompt (or `### Instruction:` as the default), and followed by `### Response:` (except if you just press Return without any input, to keep generating a longer response).
By understanding and utilizing these interaction options, you can create engaging and dynamic experiences with the LLaMA models, tailoring the text generation process to your specific needs.
## Context Management
During text generation, LLaMA models have a limited context size, which means they can only consider a certain number of tokens from the input and generated text. When the context fills up, the model resets internally, potentially losing some information from the beginning of the conversation or instructions. Context management options help maintain continuity and coherence in these situations.

View file

@ -122,8 +122,10 @@ int main(int argc, char ** argv) {
g_params = &params;
if (!gpt_params_parse(argc, argv, params)) {
gpt_params_print_usage(argc, argv, params);
return 1;
}
llama_sampling_params & sparams = params.sparams;
#ifndef LOG_DISABLE_LOGS
@ -180,9 +182,6 @@ int main(int argc, char ** argv) {
LOG_TEE("%s: seed = %u\n", __func__, params.seed);
std::mt19937 rng(params.seed);
if (params.random_prompt) {
params.prompt = string_random_prompt(rng);
}
LOG("%s: llama backend init\n", __func__);
llama_backend_init();
@ -250,11 +249,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd_inp;
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
if (params.chatml) {
params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>";
}
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
} else {
LOG("use session tokens\n");
@ -332,37 +328,13 @@ int main(int argc, char ** argv) {
}
// number of tokens to keep when resetting context
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) {
if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) {
params.n_keep = (int)embd_inp.size();
} else {
params.n_keep += add_bos; // always keep the BOS token
}
// prefix & suffix for instruct mode
const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true);
const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true);
LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str());
LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str());
// chatml prefix & suffix
const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true);
const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true);
LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str());
LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str());
// in instruct mode, we inject a prefix and a suffix to each input by the user
if (params.instruct) {
params.interactive_first = true;
params.antiprompt.emplace_back("### Instruction:\n\n");
}
// similar for chatml mode
else if (params.chatml) {
params.interactive_first = true;
params.antiprompt.emplace_back("<|im_start|>user\n");
}
else if (params.conversation) {
if (params.conversation) {
params.interactive_first = true;
}
@ -740,18 +712,26 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
printf("%s", token_str.c_str());
const std::string token_str = llama_token_to_piece(ctx, id, params.special);
// Console/Stream Output
fprintf(stdout, "%s", token_str.c_str());
// Record Displayed Tokens To Log
// Note: Generated tokens are created one by one hence this check
if (embd.size() > 1) {
// Incoming Requested Tokens
input_tokens.push_back(id);
} else {
// Outgoing Generated Tokens
output_tokens.push_back(id);
output_ss << token_str;
}
}
fflush(stdout);
}
}
// reset color to default if there is no pending user input
if (input_echo && (int) embd_inp.size() == n_consumed) {
console::set_display(console::reset);
@ -815,15 +795,13 @@ int main(int argc, char ** argv) {
is_interacting = true;
printf("\n");
} else if (params.instruct || params.chatml) {
is_interacting = true;
}
}
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
if (params.conversation || params.instruct || params.chatml) {
if (params.conversation) {
printf("\n> ");
}
@ -866,24 +844,12 @@ int main(int argc, char ** argv) {
const size_t original_size = embd_inp.size();
// instruct mode: insert instruction prefix
if (params.instruct && !is_antiprompt) {
LOG("inserting instruction prefix\n");
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end());
}
// chatml mode: insert user chat prefix
if (params.chatml && !is_antiprompt) {
LOG("inserting chatml prefix\n");
n_consumed = embd_inp.size();
embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end());
}
if (params.escape) {
string_process_escapes(buffer);
}
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials);
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
@ -892,17 +858,6 @@ int main(int argc, char ** argv) {
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end());
// instruct mode: insert response suffix
if (params.instruct) {
LOG("inserting instruction suffix\n");
embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end());
}
// chatml mode: insert assistant chat suffix
if (params.chatml) {
LOG("inserting chatml suffix\n");
embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end());
}
for (size_t i = original_size; i < embd_inp.size(); ++i) {
const llama_token token = embd_inp[i];
output_tokens.push_back(token);
@ -927,7 +882,7 @@ int main(int argc, char ** argv) {
}
// end of generation
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) {
if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.interactive)) {
LOG_TEE(" [end of text]\n");
break;
}

View file

@ -1,98 +0,0 @@
#!/usr/bin/env python3
"""
This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
Usage:
python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
Arguments:
- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
Old quant types (some base model types require these):
- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
New quant types (recommended):
- Q2_K: smallest, extreme quality loss - not recommended
- Q3_K: alias for Q3_K_M
- Q3_K_S: very small, very high quality loss
- Q3_K_M: very small, very high quality loss
- Q3_K_L: small, substantial quality loss
- Q4_K: alias for Q4_K_M
- Q4_K_S: small, significant quality loss
- Q4_K_M: medium, balanced quality - recommended
- Q5_K: alias for Q5_K_M
- Q5_K_S: large, low quality loss - recommended
- Q5_K_M: large, very low quality loss - recommended
- Q6_K: very large, extremely low quality loss
- Q8_0: very large, extremely low quality loss - not recommended
- F16: extremely large, virtually no quality loss - not recommended
- F32: absolutely huge, lossless - not recommended
"""
import subprocess
subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
import argparse
import os
from huggingface_hub import snapshot_download
def main(model, model_type, outname, outdir, quants, keep_fp16):
if not os.path.isdir(model):
print(f"Model not found at {model}. Downloading...")
try:
if outname is None:
outname = model.split('/')[-1]
model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
except Exception as e:
raise Exception(f"Could not download the model: {e}")
if outdir is None:
outdir = f'../models/{outname}'
if not os.path.isfile(f"{model}/config.json"):
raise Exception(f"Could not find config.json in {model}")
os.makedirs(outdir, exist_ok=True)
print("Building llama.cpp")
subprocess.run(f"cd .. && make quantize", shell=True, check=True)
fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
print(f"Making unquantised GGUF at {fp16}")
if not os.path.isfile(fp16):
if model_type != "llama":
subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
else:
subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
else:
print(f"Unquantised GGML already exists at: {fp16}")
print("Making quants")
for type in quants:
outfile = f"{outdir}/{outname}.gguf.{type}.bin"
print(f"Making {type} : {outfile}")
subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
if not keep_fp16:
os.remove(fp16)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
parser.add_argument('--outname', default=None, help='Output model(s) name')
parser.add_argument('--outdir', default=None, help='Output directory')
parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
args = parser.parse_args()
main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)

Some files were not shown because too many files have changed in this diff Show more