Merge branch 'master' into auto-model-support

commit c2e48979e2

53 changed files with 842 additions and 741 deletions
@@ -31,6 +31,6 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT ["/app/.devops/tools.sh"]
@@ -18,7 +18,7 @@ COPY . .
 ENV LLAMA_CURL=1


-RUN make
+RUN make -j$(nproc)

 ENV LC_ALL=C.utf8

@@ -23,7 +23,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
 # Enable CUDA
 ENV LLAMA_CUDA=1

-RUN make
+RUN make -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git
@@ -40,6 +40,6 @@ ENV LLAMA_HIPBLAS=1
 ENV CC=/opt/rocm/llvm/bin/clang
 ENV CXX=/opt/rocm/llvm/bin/clang++

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT [ "/app/main" ]
@@ -9,7 +9,7 @@ WORKDIR /app

 COPY . .

-RUN make
+RUN make -j$(nproc)

 FROM ubuntu:$UBUNTU_VERSION as runtime

@@ -25,7 +25,7 @@ ENV LLAMA_CUDA=1
 # Enable cURL
 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc)

 FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

@@ -2,6 +2,14 @@ ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as build

+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
 ARG LLAMA_SYCL_F16=OFF
 RUN apt-get update && \
     apt-get install -y git libcurl4-openssl-dev
@@ -19,6 +27,14 @@ RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \

 FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime

+RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
+    echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
+    chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
+    rm /etc/apt/sources.list.d/intel-graphics.list && \
+    wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
+    echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
+    chmod 644 /usr/share/keyrings/intel-graphics.gpg
+
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

@@ -45,6 +45,6 @@ ENV LLAMA_CURL=1
 RUN apt-get update && \
     apt-get install -y libcurl4-openssl-dev

-RUN make
+RUN make -j$(nproc)

 ENTRYPOINT [ "/app/server" ]
@@ -11,7 +11,7 @@ COPY . .

 ENV LLAMA_CURL=1

-RUN make
+RUN make -j$(nproc)

 FROM ubuntu:$UBUNTU_VERSION as runtime

@@ -8,7 +8,7 @@ arg1="$1"
 shift

 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    python3 ./convert-hf-to-gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
     ./quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
.github/ISSUE_TEMPLATE/06-question.yml (vendored, 38 changed lines: file deleted)
@@ -1,38 +0,0 @@
-name: Question
-description: Used to ask questions about llama.cpp
-title: "Question: "
-labels: ["question"]
-body:
-  - type: markdown
-    attributes:
-      value: |
-        [Please search your question first in Discussion if you got a common general question.](https://github.com/ggerganov/llama.cpp/discussions/categories/q-a)
-
-  - type: checkboxes
-    id: prerequisites
-    attributes:
-      label: Prerequisites
-      description: Please confirm the following before submitting your question.
-      options:
-        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
-          required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new useful question to share that cannot be answered within Discussions.
-          required: true
-
-  - type: textarea
-    id: background-description
-    attributes:
-      label: Background Description
-      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an question.
-      placeholder: Detailed description of your question
-    validations:
-      required: true
-
-  - type: textarea
-    id: possible-answer
-    attributes:
-      label: Possible Answer
-      description: If you have some idea of possible answers you want to confirm, that would also be appreciated.
-      placeholder: Your idea of possible answers
-    validations:
-      required: false
.github/ISSUE_TEMPLATE/06-research.yml (vendored, new file, 52 lines)
@@ -0,0 +1,52 @@
+name: Research
+description: Track new technical research area
+title: "Research: "
+labels: ["research 🔬"]
+body:
+  - type: markdown
+    attributes:
+      value: |
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+
+  - type: checkboxes
+    id: research-stage
+    attributes:
+      label: Research Stage
+      description: Track general state of this research ticket
+      options:
+        - label: Background Research (Let's try to avoid reinventing the wheel)
+        - label: Hypothesis Formed (How do you think this will work and it's effect?)
+        - label: Strategy / Implementation Forming
+        - label: Analysis of results
+        - label: Debrief / Documentation (So people in the future can learn from us)
+
+  - type: textarea
+    id: background
+    attributes:
+      label: Previous existing literature and research
+      description: Whats the current state of the art and whats the motivation for this research?
+
+  - type: textarea
+    id: hypothesis
+    attributes:
+      label: Hypothesis
+      description: How do you think this will work and it's effect?
+
+  - type: textarea
+    id: implementation
+    attributes:
+      label: Implementation
+      description: Got an approach? e.g. a PR ready to go?
+
+  - type: textarea
+    id: analysis
+    attributes:
+      label: Analysis
+      description: How does the proposed implementation behave?
+
+  - type: textarea
+    id: logs
+    attributes:
+      label: Relevant log output
+      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
+      render: shell
.github/ISSUE_TEMPLATE/config.yml (vendored, new file, 13 lines)
@@ -0,0 +1,13 @@
+blank_issues_enabled: true
+contact_links:
+  - name: Got an idea?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    about: Pop it there. It may then become an enhancement ticket.
+  - name: Got a question?
+    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    about: Ask a question there!
+  - name: Want to contribute?
+    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    about: Head to the contribution guide page of the wiki for areas you can help with
+
+
.github/workflows/docker.yml (vendored, 5 changed lines)
@@ -42,9 +42,8 @@ jobs:
           - { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
           - { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-          # TODO: Disabled due to build issues https://github.com/ggerganov/llama.cpp/issues/7507
-          #- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-          #- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
+          - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
     steps:
       - name: Check out the repo
         uses: actions/checkout@v4
@@ -1314,7 +1314,7 @@ set_target_properties(llama PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}
 install(TARGETS llama LIBRARY PUBLIC_HEADER)

 install(
-    FILES convert.py
+    FILES convert-hf-to-gguf.py
     PERMISSIONS
         OWNER_READ
         OWNER_WRITE
Makefile (1 changed line)
@@ -571,6 +571,7 @@ ifdef LLAMA_HIP_UMA
     MK_CPPFLAGS += -DGGML_HIP_UMA
 endif # LLAMA_HIP_UMA
     MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
+    MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
     MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
     HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
     HIPFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
README.md (48 changed lines)
@@ -2,7 +2,9 @@



-[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://opensource.org/licenses/MIT)
+[](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[](https://conan.io/center/llama-cpp)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -20,7 +22,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

-- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- **`convert.py` has been deprecated and moved to `examples/convert-legacy-llama.py`, please use `convert-hf-to-gguf.py`** https://github.com/ggerganov/llama.cpp/pull/7430
+- Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021
 - BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
@@ -200,6 +203,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [KodiBot](https://github.com/firatkiral/kodibot) (GPL)
 - [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT)
 - [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT)
+- [AIKit](https://github.com/sozercan/aikit) (MIT)

 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*

@@ -315,8 +319,6 @@ In order to build llama.cpp you have four different options.
    make
    ```

-  **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1`
-
 - On Windows:

   1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases).
@@ -328,23 +330,32 @@ In order to build llama.cpp you have four different options.
    make
    ```

+  - Notes:
+    - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
+    - For faster repeated compilation, install [ccache](https://ccache.dev/).
+    - For debug builds, run `make LLAMA_DEBUG=1`
+
 - Using `CMake`:

   ```bash
   cmake -B build
   cmake --build build --config Release
   ```

-  **Note**: for `Debug` builds, there are two cases:
+  **Notes**:

-  - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):
+  - For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
+  - For faster repeated compilation, install [ccache](https://ccache.dev/).
+  - For debug builds, there are two cases:
+
+    1. Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag):

      ```bash
      cmake -B build -DCMAKE_BUILD_TYPE=Debug
      cmake --build build
      ```

-  - Multi-config generators (`-G` param set to Visual Studio, XCode...):
+    2. Multi-config generators (`-G` param set to Visual Studio, XCode...):

      ```bash
      cmake -B build -G "Xcode"
@@ -379,6 +390,14 @@ In order to build llama.cpp you have four different options.
 CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read
 the instructions for use and activate this options in this document below.

+### Homebrew
+
+On Mac and Linux, the homebrew package manager can be used via
+```
+brew install llama.cpp
+```
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+
 ### Metal Build

 On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
@@ -697,7 +716,8 @@ Building the program with BLAS support may lead to some performance improvements

 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.

-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` has been moved to `examples/convert-legacy-llama.py` and shouldn't be used for anything other than `Llama/Llama2/Mistral` models and their derievatives.
+It does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.

 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -714,10 +734,10 @@ ls ./models
 python3 -m pip install -r requirements.txt

 # convert the model to ggml FP16 format
-python3 convert.py models/mymodel/
+python3 convert-hf-to-gguf.py models/mymodel/

 # [Optional] for models using BPE tokenizers
-python convert.py models/mymodel/ --vocab-type bpe
+python convert-hf-to-gguf.py models/mymodel/ --vocab-type bpe

 # quantize the model to 4-bits (using Q4_K_M method)
 ./quantize ./models/mymodel/ggml-model-f16.gguf ./models/mymodel/ggml-model-Q4_K_M.gguf Q4_K_M
@@ -287,7 +287,7 @@ function gg_run_open_llama_7b_v2 {
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
     (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
+    python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -34,8 +34,6 @@ if 'NO_LOCAL_GGUF' not in os.environ:
     sys.path.insert(1, str(Path('gguf-py')))
 import gguf

-from convert import LlamaHfVocab
-
 logger = logging.getLogger("hf-to-gguf")


@@ -582,7 +580,7 @@ class Model:
         special_vocab.add_to_gguf(self.gguf_writer)

     def _set_vocab_llama_hf(self):
-        vocab = LlamaHfVocab(self.dir_model)
+        vocab = gguf.LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
@@ -2790,7 +2788,12 @@ def main() -> None:
     hparams = Model.load_hparams(dir_model)

     with torch.inference_mode():
-        model_class = Model.from_model_architecture(hparams["architectures"][0])
+        try:
+            model_class = Model.from_model_architecture(hparams["architectures"][0])
+        except NotImplementedError:
+            logger.error(f"Model {hparams['architectures'][0]} is not supported")
+            sys.exit(1)
+
         model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy)

         logger.info("Set model parameters")
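The hunk above wraps the architecture lookup so that an unsupported `architectures` entry produces a clear log message and exit code instead of an unhandled traceback. Below is a minimal sketch of that lookup-with-fallback pattern; the registry dictionary and the architecture name are hypothetical stand-ins, not the script's actual internals.

```python
# Sketch only: illustrates the pattern used by main() in the hunk above.
import logging
import sys

logger = logging.getLogger("hf-to-gguf")

_model_classes: dict[str, type] = {}  # hypothetical registry; the real script fills one via registration


def from_model_architecture(arch: str) -> type:
    # Raise NotImplementedError for unknown architectures, as the converter does.
    try:
        return _model_classes[arch]
    except KeyError:
        raise NotImplementedError(f"Architecture {arch!r} not supported!") from None


if __name__ == "__main__":
    arch = "MadeUpExampleForCausalLM"  # placeholder architecture name
    try:
        model_class = from_model_architecture(arch)
    except NotImplementedError:
        logger.error(f"Model {arch} is not supported")
        sys.exit(1)
```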
@@ -17,7 +17,7 @@ Also, it is important to check that the examples and main ggml backends (CUDA, M
 ### 1. Convert the model to GGUF

 This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library.
-Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py).
+Depending on the model architecture, you can use either [convert-hf-to-gguf.py](../convert-hf-to-gguf.py) or [examples/convert-legacy-llama.py](../examples/convert-legacy-llama.py) (for `llama/llama2` models in `.pth` format).

 The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors.

@@ -24,14 +24,16 @@ from abc import ABC, abstractmethod
 from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
 from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional
+from typing import TYPE_CHECKING, Any, Callable, IO, Iterable, Literal, TypeVar, Optional

 import numpy as np
-from sentencepiece import SentencePieceProcessor

 if 'NO_LOCAL_GGUF' not in os.environ:
-    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
+    # use .parent.parent since we are in "examples" directory
+    sys.path.insert(1, str(Path(__file__).parent.parent / 'gguf-py'))

 import gguf
+from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab

 if TYPE_CHECKING:
     from typing_extensions import Self, TypeAlias
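The `.parent` to `.parent.parent` switch above follows from the script moving out of the repository root and into examples/. A tiny illustration of the path arithmetic, with a hypothetical checkout location, is below.

```python
# Illustration only: why the hunk above uses .parent.parent to reach gguf-py.
from pathlib import Path

repo_root = Path("/tmp/llama.cpp")                              # hypothetical checkout location
script = repo_root / "examples" / "convert-legacy-llama.py"     # new location of the script

# from examples/convert-legacy-llama.py, the bundled gguf-py package is two levels up
assert script.parent.parent / "gguf-py" == repo_root / "gguf-py"
```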
@@ -380,306 +382,6 @@ class Metadata:
         return metadata


-#
-# vocab
-#
-
-
-@runtime_checkable
-class BaseVocab(Protocol):
-    tokenizer_model: ClassVar[str]
-    name: ClassVar[str]
-
-
-class NoVocab(BaseVocab):
-    tokenizer_model = "no_vocab"
-    name = "no_vocab"
-
-    def __repr__(self) -> str:
-        return "<NoVocab for a model without integrated vocabulary>"
-
-
-@runtime_checkable
-class Vocab(BaseVocab, Protocol):
-    vocab_size: int
-    added_tokens_dict: dict[str, int]
-    added_tokens_list: list[str]
-    fname_tokenizer: Path
-
-    def __init__(self, base_path: Path): ...
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
-
-
-class BpeVocab(Vocab):
-    tokenizer_model = "gpt2"
-    name = "bpe"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-
-        if (fname_tokenizer := base_path / 'vocab.json').exists():
-            # "slow" tokenizer
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                self.vocab = json.load(f)
-
-            try:
-                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        else:
-            # "fast" tokenizer
-            fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-
-            # if this fails, FileNotFoundError propagates to caller
-            with open(fname_tokenizer, encoding="utf-8") as f:
-                tokenizer_json = json.load(f)
-
-            tokenizer_model: dict[str, Any] = tokenizer_json['model']
-            if (
-                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
-                or tokenizer_json['decoder']['type'] != 'ByteLevel'
-            ):
-                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
-
-            self.vocab = tokenizer_model["vocab"]
-
-            if (added := tokenizer_json.get('added_tokens')) is not None:
-                # Added tokens here can be duplicates of the main vocabulary.
-                added_tokens = {item['content']: item['id']
-                                for item in added
-                                if item['content'] not in self.vocab}
-
-        vocab_size = len(self.vocab)
-        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
-        actual_ids = sorted(added_tokens.values())
-        if expected_ids != actual_ids:
-            expected_end_id = vocab_size + len(actual_ids) - 1
-            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
-                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
-
-        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [text for (text, idx) in items]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
-
-        for i, _ in enumerate(self.vocab):
-            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.bpe_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class SentencePieceVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "spm"
-
-    def __init__(self, base_path: Path):
-        added_tokens: dict[str, int] = {}
-        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
-            # normal location
-            try:
-                with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f:
-                    added_tokens = json.load(f)
-            except FileNotFoundError:
-                pass
-        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
-            # not found in alternate location either
-            raise FileNotFoundError('Cannot find tokenizer.model')
-
-        self.sentencepiece_tokenizer = SentencePieceProcessor()
-        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
-        vocab_size = self.sentencepiece_tokenizer.vocab_size()
-
-        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
-        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
-        actual_new_ids = sorted(new_tokens.keys())
-
-        if expected_new_ids != actual_new_ids:
-            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
-
-        # Token pieces that were added to the base vocabulary.
-        self.added_tokens_dict = added_tokens
-        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
-        self.vocab_size_base = vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-        self.fname_tokenizer = fname_tokenizer
-
-    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        tokenizer = self.sentencepiece_tokenizer
-        for i in range(tokenizer.vocab_size()):
-            piece = tokenizer.IdToPiece(i)
-            text = piece.encode("utf-8")
-            score: float = tokenizer.GetScore(i)
-
-            toktype = gguf.TokenType.NORMAL
-            if tokenizer.IsUnknown(i):
-                toktype = gguf.TokenType.UNKNOWN
-            if tokenizer.IsControl(i):
-                toktype = gguf.TokenType.CONTROL
-
-            # NOTE: I think added_tokens are user defined.
-            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
-            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
-
-            if tokenizer.IsUnused(i):
-                toktype = gguf.TokenType.UNUSED
-            if tokenizer.IsByte(i):
-                toktype = gguf.TokenType.BYTE
-
-            yield text, score, toktype
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            score = -1000.0
-            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.sentencepiece_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
-class LlamaHfVocab(Vocab):
-    tokenizer_model = "llama"
-    name = "hfft"
-
-    def __init__(self, base_path: Path):
-        fname_tokenizer = base_path / FAST_TOKENIZER_FILE
-        # if this fails, FileNotFoundError propagates to caller
-        with open(fname_tokenizer, encoding='utf-8') as f:
-            tokenizer_json = json.load(f)
-
-        # pre-check so we know if we need transformers
-        tokenizer_model: dict[str, Any] = tokenizer_json['model']
-        is_llama3 = (
-            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
-            and not tokenizer_model.get('byte_fallback', True)
-        )
-        if is_llama3:
-            raise TypeError('Llama 3 must be converted with BpeVocab')
-
-        if not is_llama3 and (
-            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
-            or tokenizer_json['decoder']['type'] != 'Sequence'
-        ):
-            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
-
-        try:
-            from transformers import AutoTokenizer
-        except ImportError as e:
-            raise ImportError(
-                "To use LlamaHfVocab, please install the `transformers` package. "
-                "You can install it with `pip install transformers`."
-            ) from e
-
-        # Allow the tokenizer to default to slow or fast versions.
-        # Explicitly set tokenizer to use local paths.
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            base_path,
-            cache_dir=base_path,
-            local_files_only=True,
-        )
-        assert self.tokenizer.is_fast  # assume tokenizer.json is used
-
-        # Initialize lists and dictionaries for added tokens
-        self.added_tokens_list = []
-        self.added_tokens_dict = dict()
-        self.added_tokens_ids = set()
-
-        # Process added tokens
-        for tok, tokidx in sorted(
-            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
-        ):
-            # Only consider added tokens that are not in the base vocabulary
-            if tokidx >= self.tokenizer.vocab_size:
-                self.added_tokens_list.append(tok)
-                self.added_tokens_dict[tok] = tokidx
-                self.added_tokens_ids.add(tokidx)
-
-        # Store special tokens and their IDs
-        self.specials = {
-            tok: self.tokenizer.get_vocab()[tok]
-            for tok in self.tokenizer.all_special_tokens
-        }
-        self.special_ids = set(self.tokenizer.all_special_ids)
-
-        # Set vocabulary sizes
-        self.vocab_size_base = self.tokenizer.vocab_size
-        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
-
-        self.fname_tokenizer = fname_tokenizer
-
-    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        reverse_vocab = {
-            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
-        }
-
-        for token_id in range(self.vocab_size_base):
-            # Skip processing added tokens here
-            if token_id in self.added_tokens_ids:
-                continue
-
-            # Convert token text to bytes
-            token_text = reverse_vocab[token_id].encode("utf-8")
-
-            # Yield token text, score, and type
-            yield token_text, self.get_token_score(token_id), self.get_token_type(
-                token_id, token_text, self.special_ids  # Reuse already stored special IDs
-            )
-
-    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
-        # Special case for byte tokens
-        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
-            return gguf.TokenType.BYTE
-
-        # Determine token type based on whether it's a special token
-        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
-
-    def get_token_score(self, token_id: int) -> float:
-        # Placeholder for actual logic to determine the token's score
-        # This needs to be implemented based on specific requirements
-        return -1000.0  # Default score
-
-    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        for text in self.added_tokens_list:
-            if text in self.specials:
-                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
-                score = self.get_token_score(self.specials[text])
-            else:
-                toktype = gguf.TokenType.USER_DEFINED
-                score = -1000.0
-
-            yield text.encode("utf-8"), score, toktype
-
-    def has_newline_token(self):
-        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
-
-    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
-        yield from self.hf_tokens()
-        yield from self.added_tokens()
-
-    def __repr__(self) -> str:
-        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
-
-
 #
 # data loading
 # TODO: reuse (probably move to gguf.py?)
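The vocabulary loaders removed above are not dropped from the project: per the import added earlier in this diff (`from gguf import BaseVocab, Vocab, NoVocab, BpeVocab, SentencePieceVocab, LlamaHfVocab`), they now ship with the bundled gguf-py package. A hedged usage sketch follows; the model directory is hypothetical, while the interface (a constructor taking a base path, and `all_tokens()` yielding `(bytes, score, gguf.TokenType)` tuples) is the one shown in the removed code.

```python
# Sketch only: consuming the relocated vocab classes through gguf-py.
from pathlib import Path

from gguf import LlamaHfVocab  # previously defined in convert.py, now provided by gguf-py

model_dir = Path("models/mymodel")  # hypothetical model directory containing tokenizer.json
vocab = LlamaHfVocab(model_dir)

for text, score, toktype in vocab.all_tokens():
    # each entry is (token bytes, float score, gguf.TokenType), as in the removed implementation
    print(text, score, toktype)
```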
@@ -54,10 +54,10 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldpv2
 ```

-4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+4. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py path/to/MobileVLM-1.7B
+python ./examples/convert-legacy-llama.py path/to/MobileVLM-1.7B
 ```

 5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
@@ -50,10 +50,10 @@ python ./examples/llava/llava-surgery.py -m ../llava-v1.5-7b
 python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b
 ```

-5. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
+5. Use `examples/convert-legacy-llama.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
-python ./convert.py ../llava-v1.5-7b --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.5-7b --skip-unknown
 ```

 Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
@@ -92,7 +92,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m vit --llava-projecto

 6) Then convert the model to gguf format:
 ```console
-python ./convert.py ../llava-v1.6-vicuna-7b/ --skip-unknown
+python ./examples/convert-legacy-llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown
 ```

 7) And finally we can run the llava-cli using the 1.6 model version:
@@ -1,3 +1,3 @@
--r ../../requirements/requirements-convert.txt
+-r ../../requirements/requirements-convert-legacy-llama.txt
 pillow~=10.2.0
 torch~=2.1.1
@@ -1,98 +0,0 @@
-#!/usr/bin/env python3
-"""
-This script converts Hugging Face Llama, StarCoder, Falcon, Baichuan, and GPT-NeoX models to GGUF and quantizes them.
-
-Usage:
-python make-ggml.py {model_dir_or_hf_repo_name} --model_type {model_type} [--outname {output_name} (Optional)] [--outdir {output_directory} (Optional)] [--quants {quant_types} (Optional)] [--keep_fp16 (Optional)]
-
-Arguments:
-- model: (Required) The directory of the downloaded Hugging Face model or the name of the Hugging Face model repository. If the model directory does not exist, it will be downloaded from the Hugging Face model hub.
-- --model_type: (Required) The type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.
-- --outname: (Optional) The name of the output model. If not specified, the last part of the model directory path or the Hugging Face model repo name will be used.
-- --outdir: (Optional) The directory where the output model(s) will be stored. If not specified, '../models/{outname}' will be used.
-- --quants: (Optional) The types of quantization to apply. This should be a space-separated list. The default is 'Q4_K_M Q5_K_S'.
-- --keep_fp16: (Optional) If specified, the FP16 model will not be deleted after the quantized models are created.
-
-Old quant types (some base model types require these):
-- Q4_0: small, very high quality loss - legacy, prefer using Q3_K_M
-- Q4_1: small, substantial quality loss - legacy, prefer using Q3_K_L
-- Q5_0: medium, balanced quality - legacy, prefer using Q4_K_M
-- Q5_1: medium, low quality loss - legacy, prefer using Q5_K_M
-
-New quant types (recommended):
-- Q2_K: smallest, extreme quality loss - not recommended
-- Q3_K: alias for Q3_K_M
-- Q3_K_S: very small, very high quality loss
-- Q3_K_M: very small, very high quality loss
-- Q3_K_L: small, substantial quality loss
-- Q4_K: alias for Q4_K_M
-- Q4_K_S: small, significant quality loss
-- Q4_K_M: medium, balanced quality - recommended
-- Q5_K: alias for Q5_K_M
-- Q5_K_S: large, low quality loss - recommended
-- Q5_K_M: large, very low quality loss - recommended
-- Q6_K: very large, extremely low quality loss
-- Q8_0: very large, extremely low quality loss - not recommended
-- F16: extremely large, virtually no quality loss - not recommended
-- F32: absolutely huge, lossless - not recommended
-"""
-import subprocess
-subprocess.run(f"pip install huggingface-hub==0.16.4", shell=True, check=True)
-
-import argparse
-import os
-from huggingface_hub import snapshot_download
-
-def main(model, model_type, outname, outdir, quants, keep_fp16):
-    if not os.path.isdir(model):
-        print(f"Model not found at {model}. Downloading...")
-        try:
-            if outname is None:
-                outname = model.split('/')[-1]
-            model = snapshot_download(repo_id=model, cache_dir='../models/hf_cache')
-        except Exception as e:
-            raise Exception(f"Could not download the model: {e}")
-
-    if outdir is None:
-        outdir = f'../models/{outname}'
-
-    if not os.path.isfile(f"{model}/config.json"):
-        raise Exception(f"Could not find config.json in {model}")
-
-    os.makedirs(outdir, exist_ok=True)
-
-    print("Building llama.cpp")
-    subprocess.run(f"cd .. && make quantize", shell=True, check=True)
-
-    fp16 = f"{outdir}/{outname}.gguf.fp16.bin"
-
-    print(f"Making unquantised GGUF at {fp16}")
-    if not os.path.isfile(fp16):
-        if model_type != "llama":
-            subprocess.run(f"python3 ../convert-{model_type}-hf-to-gguf.py {model} 1 --outfile {fp16}", shell=True, check=True)
-        else:
-            subprocess.run(f"python3 ../convert.py {model} --outtype f16 --outfile {fp16}", shell=True, check=True)
-    else:
-        print(f"Unquantised GGML already exists at: {fp16}")
-
-    print("Making quants")
-    for type in quants:
-        outfile = f"{outdir}/{outname}.gguf.{type}.bin"
-        print(f"Making {type} : {outfile}")
-        subprocess.run(f"../quantize {fp16} {outfile} {type}", shell=True, check=True)
-
-    if not keep_fp16:
-        os.remove(fp16)
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description='Convert/Quantize HF models to GGUF. If you have the HF model downloaded already, pass the path to the model dir. Otherwise, pass the Hugging Face model repo name. You need to be in the /examples folder for it to work.')
-    parser.add_argument('model', help='Downloaded model dir or Hugging Face model repo name')
-    parser.add_argument('--model_type', required=True, choices=['llama', 'starcoder', 'falcon', 'baichuan', 'gptneox'], help='Type of the model to be converted. Choose from llama, starcoder, falcon, baichuan, or gptneox.')
-    parser.add_argument('--outname', default=None, help='Output model(s) name')
-    parser.add_argument('--outdir', default=None, help='Output directory')
-    parser.add_argument('--quants', nargs='*', default=["Q4_K_M", "Q5_K_S"], help='Quant types')
-    parser.add_argument('--keep_fp16', action='store_true', help='Keep fp16 model', default=False)
-
-    args = parser.parse_args()
-
-    main(args.model, args.model_type, args.outname, args.outdir, args.quants, args.keep_fp16)
@@ -1870,7 +1870,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         }
     }
 #else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+    if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(
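The hunk above replaces a hand-written stride check (`src0->nb[2]*src0->ne[2] == src0->nb[3]`) with the `ggml_is_contiguous_2()` helper. The condition means the two batch dimensions are packed against each other, so cublasGemmStridedBatchedEx can walk the batches with a single constant stride. The numpy sketch below is only an illustration of that stride relationship under hypothetical shapes (ggml stores dimensions in the reverse order of numpy), not ggml's actual implementation.

```python
# Illustration only: "contiguous across the batch dims" as a stride equation.
import numpy as np

def contiguous_across_batch_dims(x: np.ndarray) -> bool:
    # x has shape (ne3, ne2, ne1, ne0); strides are in bytes, like ggml's nb[]
    s = x.strides
    return s[0] == s[1] * x.shape[1]  # analogue of nb[3] == nb[2] * ne[2]

a = np.zeros((4, 3, 8, 16), dtype=np.float32)       # densely packed batch dims
b = np.zeros((8, 3, 8, 16), dtype=np.float32)[::2]  # strided view: every other batch
print(contiguous_across_batch_dims(a))  # True
print(contiguous_across_batch_dims(b))  # False
```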
@@ -2886,7 +2886,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_CONT:
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
+            return true;
         case GGML_OP_ROPE:
+            return ggml_is_contiguous(op->src[0]);
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:
@@ -170,6 +170,8 @@ void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -188,6 +190,8 @@ void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -202,6 +206,8 @@ void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     float * dst_d = (float *)dst->data;
     cudaStream_t stream = ctx.stream();

+    GGML_ASSERT(ggml_is_contiguous(src0));
+
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

@@ -61,7 +61,7 @@ static __global__ void rope(
 template<typename T, bool has_pos, bool has_freq_facs>
 static __global__ void rope_neox(
     const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims, const float * freq_factors
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors
 ) {
     const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y);

@@ -85,15 +85,13 @@ static __global__ void rope_neox(
     const int i = row*ncols + ib*n_dims + ic/2;
     const int i2 = row/p_delta_rows;

-    float cur_rot = inv_ndims * ic - ib;
-
     const int p = has_pos ? pos[i2] : 0;
     const float freq_factor = has_freq_facs ? freq_factors[ic/2] : 1.0f;

-    const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f)/freq_factor;
+    const float theta_base = p*powf(theta_scale, col/2.0f)/freq_factor;

     float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+    rope_yarn(theta_base, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);

     const float x0 = x[i + 0];
     const float x1 = x[i + n_dims/2];

@@ -174,30 +172,29 @@ static void rope_neox_cuda(
     const dim3 block_nums(nrows, num_blocks_x, 1);

     const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;

     if (pos == nullptr) {
         if (freq_factors == nullptr) {
             rope_neox<T, false, false><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         } else {
             rope_neox<T, false, true><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         }
     } else {
         if (freq_factors == nullptr) {
             rope_neox<T, true, false><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         } else {
             rope_neox<T, true, true><<<block_nums, block_dims, 0, stream>>>(
                 x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims,
-                theta_scale, inv_ndims, freq_factors
+                theta_scale, freq_factors
             );
         }
     }
@ -254,6 +251,7 @@ void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
float * dst_d = (float *)dst->data;
|
float * dst_d = (float *)dst->data;
|
||||||
cudaStream_t stream = ctx.stream();
|
cudaStream_t stream = ctx.stream();
|
||||||
|
|
||||||
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
|
||||||
GGML_ASSERT(src0->type == dst->type);
|
GGML_ASSERT(src0->type == dst->type);
|
||||||
|
|
|
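The NeoX changes above drop the inv_ndims/cur_rot bookkeeping and fold the per-pair exponent into theta_scale. A minimal scalar sketch of the angle that ends up being computed, purely illustrative (the function name and plain-C form below are assumptions, not part of the patch):

    #include <cmath>

    // theta_scale = freq_base^(-2/n_dims), so theta_scale^i == freq_base^(-2*i/n_dims)
    // for the i-th rotated pair; freq_factor is the optional per-pair scaling factor.
    static float neox_theta(int pos, int i_pair, int n_dims, float freq_base, float freq_factor) {
        const float theta_scale = powf(freq_base, -2.0f/n_dims);
        return pos * powf(theta_scale, (float) i_pair) / freq_factor;
    }

The frequency scaling (freq_scale) is still passed to rope_yarn, which applies it together with the YaRN correction terms.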
@@ -1597,7 +1597,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 {
 GGML_ASSERT(ne00 == ne10);

-// TODO: assert that dim2 and dim3 are contiguous
 GGML_ASSERT(ne12 % ne02 == 0);
 GGML_ASSERT(ne13 % ne03 == 0);
@@ -1519,7 +1519,6 @@ static enum ggml_status ggml_metal_graph_compute(
 {
 GGML_ASSERT(ne00 == ne10);

-// TODO: assert that dim2 and dim3 are contiguous
 GGML_ASSERT(ne12 % ne02 == 0);
 GGML_ASSERT(ne13 % ne03 == 0);

@@ -2187,6 +2186,7 @@ static enum ggml_status ggml_metal_graph_compute(
 case GGML_OP_RMS_NORM:
 {
 GGML_ASSERT(ne00 % 4 == 0);
+GGML_ASSERT(ggml_is_contiguous_1(src0));

 float eps;
 memcpy(&eps, dst->op_params, sizeof(float));

@@ -2214,6 +2214,7 @@ static enum ggml_status ggml_metal_graph_compute(
 case GGML_OP_GROUP_NORM:
 {
 GGML_ASSERT(ne00 % 4 == 0);
+GGML_ASSERT(ggml_is_contiguous(src0));

 //float eps;
 //memcpy(&eps, dst->op_params, sizeof(float));

@@ -2247,6 +2248,8 @@ static enum ggml_status ggml_metal_graph_compute(
 } break;
 case GGML_OP_NORM:
 {
+GGML_ASSERT(ggml_is_contiguous_1(src0));
+
 float eps;
 memcpy(&eps, dst->op_params, sizeof(float));
@@ -1767,13 +1767,13 @@ kernel void kernel_rope(

 const int64_t p = pos[i2];

-const float theta_0 = (float)p;
+const float theta_base = (float)p;
 const float inv_ndims = -1.f/n_dims;

 if (!is_neox) {
 for (int64_t i0 = 2*tiitg; i0 < ne0; i0 += 2*tptg.x) {
+const float theta = theta_base * pow(freq_base, inv_ndims*i0);

-const float theta = theta_0 * pow(freq_base, inv_ndims*i0);
 float cos_theta, sin_theta;
 rope_yarn(theta, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta);

@@ -1789,18 +1789,14 @@ kernel void kernel_rope(
 } else {
 for (int64_t ic = 2*tiitg; ic < ne0; ic += 2*tptg.x) {
 if (ic < n_dims) {
-const int64_t ib = 0;
+const int64_t i0 = ic/2;

-// simplified from `(ib * n_dims + ic) * inv_ndims`
-const float cur_rot = inv_ndims*ic - ib;
-const float freq_factor = src2 != src0 ? src2[ic/2] : 1.0f;
+const float freq_factor = src2 != src0 ? src2[i0] : 1.0f;

-const float theta = theta_0 * pow(freq_base, cur_rot) / freq_factor;
+const float theta = theta_base * pow(freq_base, inv_ndims*ic);

 float cos_theta, sin_theta;
-rope_yarn(theta, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+rope_yarn(theta/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor, &cos_theta, &sin_theta);

-const int64_t i0 = ib*n_dims + ic/2;

 device const T * const src = (device T *)((device char *) src0 + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
 device T * dst_data = (device T *)((device char *) dst + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
@@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 const uint8_t * restrict q2 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;

 const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
 const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
 const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);

@@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 for (int i = 0; i < nb; ++i) {

 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+const uint8_t * restrict q3 = x[i].qs;
+const int8_t * restrict q8 = y[i].qs;
 // Set up scales
 memcpy(aux, x[i].scales, 12);
 __m128i scales128 = lsx_set_w(

@@ -6828,29 +6831,32 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 int bit = 0;
 int is = 0;
+__m256i xvbit;

-const uint8_t * restrict q3 = x[i].qs;
-const int8_t * restrict q8 = y[i].qs;

 for (int j = 0; j < QK_K/128; ++j) {
 // load low 2 bits
 const __m256i q3bits = __lasx_xvld((const __m256i*)q3, 0); q3 += 32;

+xvbit = __lasx_xvreplgr2vr_h(bit);
 // prepare low and high bits
 const __m256i q3l_0 = __lasx_xvand_v(q3bits, m3);
-const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+const __m256i q3h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
 ++bit;

+xvbit = __lasx_xvreplgr2vr_h(bit);
 const __m256i q3l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 2), m3);
-const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+const __m256i q3h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
 ++bit;

+xvbit = __lasx_xvreplgr2vr_h(bit);
 const __m256i q3l_2 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 4), m3);
-const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+const __m256i q3h_2 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
 ++bit;

+xvbit = __lasx_xvreplgr2vr_h(bit);
 const __m256i q3l_3 = __lasx_xvand_v(__lasx_xvsrli_h(q3bits, 6), m3);
-const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvandn_v(hbits, __lasx_xvslli_h(mone, bit)), bit), 2);
+const __m256i q3h_3 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvandn_v(hbits, __lasx_xvsll_h(mone, xvbit)), xvbit), 2);
 ++bit;

 // load Q8 quants
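The q3_K change above appears to swap immediate-count shifts for variable-count vector shifts: the running bit index is broadcast into xvbit before each use. Per element, the high-bit extraction that is being vectorized is simple; a scalar reference, written here only as an illustration (the function name is made up):

    #include <stdint.h>

    // Take bit 'bit' of the complemented high-bit plane and give it weight 4,
    // which is what each q3h_* lane holds before it is subtracted from the quant.
    static inline uint16_t q3_high_bits(uint16_t hbits, int bit) {
        return (uint16_t)((((~hbits) >> bit) & 1u) << 2);
    }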
@@ -7399,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 *s = vec_extract(vsumf0, 0);

 #elif defined __loongarch_asx
+GGML_UNUSED(kmask1);
+GGML_UNUSED(kmask2);
+GGML_UNUSED(kmask3);

 const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);

@@ -7411,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

 memcpy(utmp, x[i].scales, 12);
+utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+const uint32_t uaux = utmp[1] & kmask1;
+utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+utmp[2] = uaux;
+utmp[0] &= kmask1;

 const uint8_t * restrict q4 = x[i].qs;
 const int8_t * restrict q8 = y[i].qs;

@@ -7450,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 __m256 vd = __lasx_xvreplfr2vr_s(d);
 acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);

 }

 acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
 __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
 acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);

 ft_union fi;
 fi.i = __lsx_vpickve2gr_w(acc_m, 0);
 *s = hsum_float_8(acc) + fi.f ;

 #else

 const uint8_t * scales = (const uint8_t*)&utmp[0];

@@ -7997,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 *s = vec_extract(vsumf0, 0);

 #elif defined __loongarch_asx
+GGML_UNUSED(kmask1);
+GGML_UNUSED(kmask2);
+GGML_UNUSED(kmask3);

 const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
 const __m128i mzero = __lsx_vldi(0);

@@ -8015,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

 memcpy(utmp, x[i].scales, 12);
+utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+const uint32_t uaux = utmp[1] & kmask1;
+utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+utmp[2] = uaux;
+utmp[0] &= kmask1;

 const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));

@@ -8033,6 +8056,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 __m256i sumi = __lasx_xvldi(0);

 int bit = 0;
+__m256i xvbit;

 for (int j = 0; j < QK_K/64; ++j) {

@@ -8041,13 +8065,15 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r

 const __m256i q5bits = __lasx_xvld((const __m256i*)q5, 0); q5 += 32;

+xvbit = __lasx_xvreplgr2vr_h(bit++);
 const __m256i q5l_0 = __lasx_xvand_v(q5bits, m4);
-const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
+const __m256i q5h_0 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
 const __m256i q5_0 = __lasx_xvadd_b(q5l_0, q5h_0);
 hmask = __lasx_xvslli_h(hmask, 1);

+xvbit = __lasx_xvreplgr2vr_h(bit++);
 const __m256i q5l_1 = __lasx_xvand_v(__lasx_xvsrli_h(q5bits, 4), m4);
-const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrli_h(__lasx_xvand_v(hbits, hmask), bit++), 4);
+const __m256i q5h_1 = __lasx_xvslli_h(__lasx_xvsrl_h(__lasx_xvand_v(hbits, hmask), xvbit), 4);
 const __m256i q5_1 = __lasx_xvadd_b(q5l_1, q5h_1);
 hmask = __lasx_xvslli_h(hmask, 1);

@@ -8061,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 p16_1 = lasx_madd_h(scale_1, p16_1);

 sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));

 }

 __m256 vd = __lasx_xvreplfr2vr_s(d);
 acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);

 }

 *s = hsum_float_8(acc) + summs;
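The q5_K path follows the same pattern: the fifth bit of each quant is pulled out of the shared high-bit plane with a variable vector shift and re-attached above the low nibble. A scalar reference for one element, illustrative only (the helper name is invented):

    #include <stdint.h>

    // Reconstruct a 5-bit quant from its low nibble and bit 'bit' of the high-bit plane.
    static inline uint8_t q5_element(uint8_t low_nibble, uint16_t hbits, int bit) {
        return (uint8_t)(low_nibble | (((hbits >> bit) & 1u) << 4));
    }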
@@ -15183,7 +15183,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0,
 const int64_t r2 = ne12/ne02;
 const int64_t r3 = ne13/ne03;

-if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
 // there is no broadcast and src0, src1 are contiguous across dims 2, 3
 SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
 *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans,
ggml.c (130 changed lines)

@@ -1576,11 +1576,11 @@ do { \

 // F16 arithmetic is not supported by AVX, so we use F32 instead

 #define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

-static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
 float tmp[8];

 for (int i = 0; i < 8; i++) {

@@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(ggml_fp16_t *x) {

 return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
 float arr[8];

 __lasx_xvst(y, arr, 0);

-for (int i = 0; i < 8; i++)
+for (int i = 0; i < 8; i++) {
 x[i] = GGML_FP32_TO_FP16(arr[i]);
+}
 }
 #define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

@@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR 4

-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
 float tmp[4];

 tmp[0] = GGML_FP16_TO_FP32(x[0]);

@@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
 return __lsx_vld(tmp, 0);
 }

-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
 float arr[4];

 __lsx_vst(y, arr, 0);
@@ -2315,32 +2316,27 @@ inline static __m512 ggml_v_expf(__m512 x) {
 const __m512 r = _mm512_set1_ps(0x1.8p23f);
 const __m512 z = _mm512_fmadd_ps(x, _mm512_set1_ps(0x1.715476p+0f), r);
 const __m512 n = _mm512_sub_ps(z, r);
-const __m512 b = _mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
-_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
-const __m512i e = _mm512_slli_epi32(_mm512_castps_si512(z), 23);
-const __m512 k = _mm512_castsi512_ps(_mm512_add_epi32(e, _mm512_castps_si512(_mm512_set1_ps(1))));
-const __mmask16 c = _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(126), _CMP_GT_OQ);
-const __m512 u = _mm512_mul_ps(b, b);
-const __m512 j = _mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
-_mm512_set1_ps(0x1.573e2ep-5f)), u,
-_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
-_mm512_set1_ps(0x1.fffdb6p-2f))),
-u, _mm512_mul_ps(_mm512_set1_ps(0x1.ffffecp-1f), b));
-if (_mm512_kortestz(c, c))
-return _mm512_fmadd_ps(j, k, k);
-const __m512i g = _mm512_and_si512(
-_mm512_movm_epi32(_mm512_cmp_ps_mask(n, _mm512_setzero_ps(), _CMP_LE_OQ)),
-_mm512_set1_epi32(0x82000000u));
-const __m512 s1 =
-_mm512_castsi512_ps(_mm512_add_epi32(g, _mm512_set1_epi32(0x7f000000u)));
-const __m512 s2 = _mm512_castsi512_ps(_mm512_sub_epi32(e, g));
+const __m512 b =
+_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.7f7d1cp-20f),
+_mm512_fnmadd_ps(n, _mm512_set1_ps(0x1.62e4p-1f), x));
 const __mmask16 d =
 _mm512_cmp_ps_mask(_mm512_abs_ps(n), _mm512_set1_ps(192), _CMP_GT_OQ);
-return _mm512_mask_blend_ps(
-d, _mm512_mask_blend_ps(
-c, _mm512_fmadd_ps(k, j, k),
-_mm512_mul_ps(_mm512_fmadd_ps(s2, j, s2), s1)),
-_mm512_mul_ps(s1, s1));
+const __m512 u = _mm512_mul_ps(b, b);
+const __m512 j = _mm512_fmadd_ps(
+_mm512_fmadd_ps(_mm512_fmadd_ps(_mm512_set1_ps(0x1.0e4020p-7f), b,
+_mm512_set1_ps(0x1.573e2ep-5f)),
+u,
+_mm512_fmadd_ps(_mm512_set1_ps(0x1.555e66p-3f), b,
+_mm512_set1_ps(0x1.fffdb6p-2f))),
+u,
+_mm512_fmadd_ps(_mm512_set1_ps(0x1.ffffecp-1f), b, _mm512_set1_ps(1.0F)));
+const __m512 res = _mm512_scalef_ps(j, n);
+if (_mm512_kortestz(d, d))
+return res;
+const __m512 zero = _mm512_setzero_ps();
+const __m512 alt = _mm512_mask_blend_ps(
+_mm512_cmp_ps_mask(n, zero, _CMP_LE_OQ), _mm512_set1_ps(INFINITY), zero);
+return _mm512_mask_blend_ps(d, res, alt);
 }

 // computes silu x/(1+exp(-x)) in single precision vector
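The reworked ggml_v_expf builds 2^n with _mm512_scalef_ps instead of assembling exponent bits by hand, and only falls back to the blended path for very large |n|. As a rough scalar sketch of the same range-reduction idea (the constants and the low-order polynomial below are illustrative, not the exact coefficients used above):

    #include <cmath>

    // exp(x) ~= 2^n * p(r), with n = round(x * log2(e)) and r = x - n*ln(2).
    static float exp_scalef_sketch(float x) {
        const float n = nearbyintf(x * 1.442695041f);       // round to nearest integer
        const float r = x - n * 0.6931471806f;               // remainder near zero
        const float p = 1.0f + r + 0.5f*r*r + r*r*r/6.0f;    // low-order polynomial in r
        return ldexpf(p, (int) n);                           // p * 2^n, like _mm512_scalef_ps
    }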
@@ -3221,7 +3217,11 @@ GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor) {
 tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

-static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * tensor) {
+GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) {
+return ggml_is_contiguous(tensor);
+}
+
+GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");

 return

@@ -3230,6 +3230,14 @@ static inline bool ggml_is_contiguous_except_dim_1(const struct ggml_tensor * te
 tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }

+GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor) {
+static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+return
+tensor->nb[0] == ggml_type_size(tensor->type) &&
+tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
+}
+
 GGML_CALL bool ggml_is_permuted(const struct ggml_tensor * tensor) {
 static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
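ggml_is_contiguous_0/1/2 formalize the "contiguous from dimension n upwards" checks that the backends above now rely on (the Metal asserts, the SYCL batched matmul, and the gelu/silu paths below). A small illustrative sketch of the idea on a hypothetical 4-D tensor type; the struct and helper are invented for this example and are not the ggml implementation:

    #include <cstddef>
    #include <cstdint>

    struct toy_tensor {
        size_t  type_size; // bytes per element
        int64_t ne[4];     // elements per dimension
        size_t  nb[4];     // stride in bytes per dimension
    };

    // "contiguous for dims >= n": elements are packed (nb[0] == type size) and every
    // dimension above n follows directly from the one below it; dims 1..n may be padded.
    static bool toy_is_contiguous_n(const toy_tensor & t, int n) {
        if (t.nb[0] != t.type_size) {
            return false;
        }
        for (int i = n + 1; i < 4; ++i) {
            if (t.nb[i] != t.nb[i-1] * (size_t) t.ne[i-1]) {
                return false;
            }
        }
        return true;
    }

Under this reading, n = 0 is full contiguity (hence ggml_is_contiguous_0 simply forwards to ggml_is_contiguous), while n = 1 and n = 2 tolerate padding in the lower dimensions.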
@@ -11420,8 +11428,8 @@ static void ggml_compute_forward_gelu_f32(

 const struct ggml_tensor * src0 = dst->src[0];

-GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+GGML_ASSERT(ggml_is_contiguous_1(src0));
+GGML_ASSERT(ggml_is_contiguous_1(dst));
 GGML_ASSERT(ggml_are_same_shape(src0, dst));

 if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11483,8 +11491,8 @@ static void ggml_compute_forward_gelu_quick_f32(

 const struct ggml_tensor * src0 = dst->src[0];

-GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+GGML_ASSERT(ggml_is_contiguous_1(src0));
+GGML_ASSERT(ggml_is_contiguous_1(dst));
 GGML_ASSERT(ggml_are_same_shape(src0, dst));

 if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11546,8 +11554,8 @@ static void ggml_compute_forward_silu_f32(

 const struct ggml_tensor * src0 = dst->src[0];

-GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+GGML_ASSERT(ggml_is_contiguous_1(src0));
+GGML_ASSERT(ggml_is_contiguous_1(dst));
 GGML_ASSERT(ggml_are_same_shape(src0, dst));

 if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {

@@ -11658,9 +11666,9 @@ static void ggml_compute_forward_silu_back_f32(
 const struct ggml_tensor * src0 = dst->src[0];
 const struct ggml_tensor * grad = dst->src[1];

-GGML_ASSERT(ggml_is_contiguous_except_dim_1(grad));
-GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0));
-GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst));
+GGML_ASSERT(ggml_is_contiguous_1(grad));
+GGML_ASSERT(ggml_is_contiguous_1(src0));
+GGML_ASSERT(ggml_is_contiguous_1(dst));
 GGML_ASSERT(ggml_are_same_shape(src0, dst));
 GGML_ASSERT(ggml_are_same_shape(src0, grad));
@@ -14358,7 +14366,7 @@ static void ggml_compute_forward_rope_f32(
 int ir = 0;

 const float theta_scale = powf(freq_base, -2.0f/n_dims);
-const float inv_ndims = -1.f/n_dims;
 float corr_dims[2];
 ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

@@ -14407,7 +14415,7 @@ static void ggml_compute_forward_rope_f32(
 const float cos_block_theta = cosf(block_theta);
 const float sin_block_theta = sinf(block_theta) * sin_sign;

 theta_base *= theta_scale;
 block_theta *= theta_scale;

 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

@@ -14442,29 +14450,22 @@ static void ggml_compute_forward_rope_f32(
 dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
 }
 } else {
-// TODO: this might be wrong for ne0 != n_dims - need double check
-// it seems we have to rope just the first n_dims elements and do nothing with the rest
-// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-theta_base *= freq_scale;
+// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
 for (int64_t ic = 0; ic < ne0; ic += 2) {
 if (ic < n_dims) {
-const int64_t ib = 0;
+const int64_t i0 = ic/2;

-// simplified from `(ib * n_dims + ic) * inv_ndims`
-float cur_rot = inv_ndims * ic - ib;
-float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;

 float cos_theta, sin_theta;
 rope_yarn(
-theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
 &cos_theta, &sin_theta
 );
-sin_theta *= sin_sign;

+sin_theta *= sin_sign;
 theta_base *= theta_scale;

-const int64_t i0 = ib*n_dims + ic/2;

 const float * const src = (float *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
 float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);

@@ -14543,7 +14544,7 @@ static void ggml_compute_forward_rope_f16(
 int ir = 0;

 const float theta_scale = powf(freq_base, -2.0f/n_dims);
-const float inv_ndims = -1.f/n_dims;
 float corr_dims[2];
 ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims);

@@ -14592,7 +14593,7 @@ static void ggml_compute_forward_rope_f16(
 const float cos_block_theta = cosf(block_theta);
 const float sin_block_theta = sinf(block_theta) * sin_sign;

 theta_base *= theta_scale;
 block_theta *= theta_scale;

 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);

@@ -14623,29 +14624,22 @@ static void ggml_compute_forward_rope_f16(
 dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
 }
 } else {
-// TODO: this might be wrong for ne0 != n_dims - need double check
-// it seems we have to rope just the first n_dims elements and do nothing with the rest
-// ref: https://github.com/ml-explore/mlx/blob/dc2edc762c797e3b8de50b1dad4dc0a131691033/benchmarks/python/llama_jax_bench.py#L11-L26
-theta_base *= freq_scale;
+// ref: https://github.com/jquesnelle/yarn/blob/master/scaled_rope/LlamaYaRNScaledRotaryEmbedding.py
 for (int64_t ic = 0; ic < ne0; ic += 2) {
 if (ic < n_dims) {
-const int64_t ib = 0;
+const int64_t i0 = ic/2;

-// simplified from `(ib * n_dims + ic) * inv_ndims`
-float cur_rot = inv_ndims * ic - ib;
-float freq_factor = freq_factors ? freq_factors[ic/2] : 1.0f;
+const float freq_factor = freq_factors ? freq_factors[i0] : 1.0f;

 float cos_theta, sin_theta;
 rope_yarn(
-theta_base/freq_factor, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor,
+theta_base/freq_factor, freq_scale, corr_dims, ic, ext_factor, attn_factor,
 &cos_theta, &sin_theta
 );
-sin_theta *= sin_sign;

+sin_theta *= sin_sign;
 theta_base *= theta_scale;

-const int64_t i0 = ib*n_dims + ic/2;

 const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01 + i0*nb00);
 ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);
ggml.h (6 changed lines)

@@ -756,7 +756,6 @@ extern "C" {
 GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

 GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
-GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor);
 GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
 GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
 GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);

@@ -765,6 +764,11 @@ extern "C" {
 GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
 GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars

+GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
+GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
+GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
+GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
+
 GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
 GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
@@ -2670,14 +2670,12 @@ void main() {
 const uint i = row*p.ncols + ib*p.ndims + ic/2;
 const uint i2 = row/p.p_delta_rows;

-const float cur_rot = p.inv_ndims * ic - ib;
-
 const int pos = data_b[i2];
 const float freq_factor = p.has_freq_facs != 0 ? data_freq_factors[ic/2] : 1.0f;
 const float theta_base = pos*p.freq_scale*pow(p.theta_scale, col/2.0f) / freq_factor;

 float cos_theta, sin_theta;
-rope_yarn(theta_base, uint(cur_rot), cos_theta, sin_theta);
+rope_yarn(theta_base, ic, cos_theta, sin_theta);

 const float x0 = float(data_a[i + 0]);
 const float x1 = float(data_a[i + p.ndims/2]);
@@ -1,10 +1,15 @@
 from __future__ import annotations

+import re
 import logging
 import json
 import os
 from pathlib import Path
-from typing import Any, Callable, Sequence, Mapping, Iterable
+from typing import Any, Callable, Sequence, Mapping, Iterable, Protocol, ClassVar, runtime_checkable

+from sentencepiece import SentencePieceProcessor
+
+import gguf
+
 from .gguf_writer import GGUFWriter

@@ -163,3 +168,298 @@ class SpecialVocab:
 for typ in self.special_token_types:
 self._set_special_token(typ, config.get(f'{typ}_token_id'))
 return True
+
+
+@runtime_checkable
+class BaseVocab(Protocol):
+    tokenizer_model: ClassVar[str]
+    name: ClassVar[str]
+
+
+@runtime_checkable
+class Vocab(BaseVocab, Protocol):
+    vocab_size: int
+    added_tokens_dict: dict[str, int]
+    added_tokens_list: list[str]
+    fname_tokenizer: Path
+
+    def __init__(self, base_path: Path): ...
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ...
+
+
+class NoVocab(BaseVocab):
+    tokenizer_model = "no_vocab"
+    name = "no_vocab"
+
+    def __repr__(self) -> str:
+        return "<NoVocab for a model without integrated vocabulary>"
+
+
+class BpeVocab(Vocab):
+    tokenizer_model = "gpt2"
+    name = "bpe"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+
+        if (fname_tokenizer := base_path / 'vocab.json').exists():
+            # "slow" tokenizer
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                self.vocab = json.load(f)
+
+            try:
+                # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.
+                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        else:
+            # "fast" tokenizer
+            fname_tokenizer = base_path / 'tokenizer.json'
+
+            # if this fails, FileNotFoundError propagates to caller
+            with open(fname_tokenizer, encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+
+            tokenizer_model: dict[str, Any] = tokenizer_json['model']
+            if (
+                tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False)
+                or tokenizer_json['decoder']['type'] != 'ByteLevel'
+            ):
+                raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer')
+
+            self.vocab = tokenizer_model["vocab"]
+
+            if (added := tokenizer_json.get('added_tokens')) is not None:
+                # Added tokens here can be duplicates of the main vocabulary.
+                added_tokens = {item['content']: item['id']
+                                for item in added
+                                if item['content'] not in self.vocab}
+
+        vocab_size = len(self.vocab)
+        expected_ids = list(range(vocab_size, vocab_size + len(added_tokens)))
+        actual_ids = sorted(added_tokens.values())
+        if expected_ids != actual_ids:
+            expected_end_id = vocab_size + len(actual_ids) - 1
+            raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range "
+                             f"{vocab_size} - {expected_end_id}; got {actual_ids}")
+
+        items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1])
+        self.added_tokens_dict = added_tokens
+        self.added_tokens_list = [text for (text, idx) in items]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+
+    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()}
+
+        for i, _ in enumerate(self.vocab):
+            yield reverse_vocab[i], 0.0, gguf.TokenType.NORMAL
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.CONTROL
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.bpe_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<BpeVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class SentencePieceVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "spm"
+
+    def __init__(self, base_path: Path):
+        added_tokens: dict[str, int] = {}
+        if (fname_tokenizer := base_path / 'tokenizer.model').exists():
+            # normal location
+            try:
+                with open(base_path / 'added_tokens.json', encoding="utf-8") as f:
+                    added_tokens = json.load(f)
+            except FileNotFoundError:
+                pass
+        elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists():
+            # not found in alternate location either
+            raise FileNotFoundError('Cannot find tokenizer.model')
+
+        self.sentencepiece_tokenizer = SentencePieceProcessor()
+        self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
+        vocab_size = self.sentencepiece_tokenizer.vocab_size()
+
+        new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
+        expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens)))
+        actual_new_ids = sorted(new_tokens.keys())
+
+        if expected_new_ids != actual_new_ids:
+            raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}")
+
+        # Token pieces that were added to the base vocabulary.
+        self.added_tokens_dict = added_tokens
+        self.added_tokens_list = [new_tokens[id] for id in actual_new_ids]
+        self.vocab_size_base = vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+        self.fname_tokenizer = fname_tokenizer
+
+    def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        tokenizer = self.sentencepiece_tokenizer
+        for i in range(tokenizer.vocab_size()):
+            piece = tokenizer.IdToPiece(i)
+            text = piece.encode("utf-8")
+            score: float = tokenizer.GetScore(i)
+
+            toktype = gguf.TokenType.NORMAL
+            if tokenizer.IsUnknown(i):
+                toktype = gguf.TokenType.UNKNOWN
+            if tokenizer.IsControl(i):
+                toktype = gguf.TokenType.CONTROL
+
+            # NOTE: I think added_tokens are user defined.
+            # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
+            # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
+
+            if tokenizer.IsUnused(i):
+                toktype = gguf.TokenType.UNUSED
+            if tokenizer.IsByte(i):
+                toktype = gguf.TokenType.BYTE
+
+            yield text, score, toktype
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            score = -1000.0
+            yield text.encode("utf-8"), score, gguf.TokenType.USER_DEFINED
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.sentencepiece_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<SentencePieceVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
+
+
+class LlamaHfVocab(Vocab):
+    tokenizer_model = "llama"
+    name = "hfft"
+
+    def __init__(self, base_path: Path):
+        fname_tokenizer = base_path / 'tokenizer.json'
+        # if this fails, FileNotFoundError propagates to caller
+        with open(fname_tokenizer, encoding='utf-8') as f:
+            tokenizer_json = json.load(f)
+
+        # pre-check so we know if we need transformers
+        tokenizer_model: dict[str, Any] = tokenizer_json['model']
+        is_llama3 = (
+            tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False)
+            and not tokenizer_model.get('byte_fallback', True)
+        )
+        if is_llama3:
+            raise TypeError('Llama 3 must be converted with BpeVocab')
+
+        if not is_llama3 and (
+            tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False)
+            or tokenizer_json['decoder']['type'] != 'Sequence'
+        ):
+            raise FileNotFoundError('Cannot find Llama BPE tokenizer')
+
+        try:
+            from transformers import AutoTokenizer
+        except ImportError as e:
+            raise ImportError(
+                "To use LlamaHfVocab, please install the `transformers` package. "
+                "You can install it with `pip install transformers`."
+            ) from e
+
+        # Allow the tokenizer to default to slow or fast versions.
+        # Explicitly set tokenizer to use local paths.
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            base_path,
+            cache_dir=base_path,
+            local_files_only=True,
+        )
+        assert self.tokenizer.is_fast  # assume tokenizer.json is used
+
+        # Initialize lists and dictionaries for added tokens
+        self.added_tokens_list = []
+        self.added_tokens_dict = dict()
+        self.added_tokens_ids = set()
+
+        # Process added tokens
+        for tok, tokidx in sorted(
+            self.tokenizer.get_added_vocab().items(), key=lambda x: x[1]
+        ):
+            # Only consider added tokens that are not in the base vocabulary
+            if tokidx >= self.tokenizer.vocab_size:
+                self.added_tokens_list.append(tok)
+                self.added_tokens_dict[tok] = tokidx
+                self.added_tokens_ids.add(tokidx)
+
+        # Store special tokens and their IDs
+        self.specials = {
+            tok: self.tokenizer.get_vocab()[tok]
+            for tok in self.tokenizer.all_special_tokens
+        }
+        self.special_ids = set(self.tokenizer.all_special_ids)
+
+        # Set vocabulary sizes
+        self.vocab_size_base = self.tokenizer.vocab_size
+        self.vocab_size = self.vocab_size_base + len(self.added_tokens_list)
+
+        self.fname_tokenizer = fname_tokenizer
+
+    def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        reverse_vocab = {
+            id: encoded_tok for encoded_tok, id in self.tokenizer.get_vocab().items()
+        }
+
+        for token_id in range(self.vocab_size_base):
+            # Skip processing added tokens here
+            if token_id in self.added_tokens_ids:
+                continue
+
+            # Convert token text to bytes
+            token_text = reverse_vocab[token_id].encode("utf-8")
+
+            # Yield token text, score, and type
+            yield token_text, self.get_token_score(token_id), self.get_token_type(
+                token_id, token_text, self.special_ids  # Reuse already stored special IDs
+            )
+
+    def get_token_type(self, token_id: int, token_text: bytes, special_ids: set[int]) -> gguf.TokenType:
+        # Special case for byte tokens
+        if re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text):
+            return gguf.TokenType.BYTE
+
+        # Determine token type based on whether it's a special token
+        return gguf.TokenType.CONTROL if token_id in special_ids else gguf.TokenType.NORMAL
+
+    def get_token_score(self, token_id: int) -> float:
+        # Placeholder for actual logic to determine the token's score
+        # This needs to be implemented based on specific requirements
+        return -1000.0  # Default score
+
+    def added_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        for text in self.added_tokens_list:
+            if text in self.specials:
+                toktype = self.get_token_type(self.specials[text], b'', self.special_ids)
+                score = self.get_token_score(self.specials[text])
+            else:
+                toktype = gguf.TokenType.USER_DEFINED
+                score = -1000.0
+
+            yield text.encode("utf-8"), score, toktype
+
+    def has_newline_token(self):
+        return "<0x0A>" in self.tokenizer.vocab or "\n" in self.tokenizer.vocab
+
+    def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
+        yield from self.hf_tokens()
+        yield from self.added_tokens()
+
+    def __repr__(self) -> str:
+        return f"<LlamaHfVocab with {self.vocab_size_base} base tokens and {len(self.added_tokens_list)} added tokens>"
@@ -144,6 +144,7 @@ def main() -> None:
 parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
 parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
 parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
+parser.add_argument("--pre-tokenizer", type=str, help="The models tokenizer.ggml.pre", metavar='"pre tokenizer"')
 parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
 parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
 parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))

@@ -172,6 +173,9 @@ def main() -> None:
 if template:
 new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)

+if args.pre_tokenizer:
+    new_metadata[gguf.Keys.Tokenizer.PRE] = MetadataDetails(gguf.GGUFValueType.STRING, args.pre_tokenizer)
+
 if remove_metadata:
 logger.warning('*** Warning *** Warning *** Warning **')
 logger.warning('* Most metadata is required for a fully functional GGUF file,')
251
llama.cpp
251
llama.cpp
|
@ -1706,12 +1706,13 @@ struct llama_mlock {
|
||||||
};
|
};
|
||||||
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
using llama_mlocks = std::vector<std::unique_ptr<llama_mlock>>;
|
||||||
|
|
||||||
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
|
// NOTE: avoid ever using this except for building the token_to_piece caches
|
||||||
|
static std::string llama_token_to_piece(const struct llama_model * model, llama_token token, bool special) {
|
||||||
std::vector<char> result(8, 0);
|
std::vector<char> result(8, 0);
|
||||||
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
const int n_tokens = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
||||||
if (n_tokens < 0) {
|
if (n_tokens < 0) {
|
||||||
result.resize(-n_tokens);
|
result.resize(-n_tokens);
|
||||||
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
|
int check = llama_token_to_piece(model, token, result.data(), result.size(), special);
|
||||||
GGML_ASSERT(check == -n_tokens);
|
GGML_ASSERT(check == -n_tokens);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
|
@ -2166,7 +2167,9 @@ struct llama_vocab {
|
||||||
std::unordered_map<token, id> token_to_id;
|
std::unordered_map<token, id> token_to_id;
|
||||||
std::vector<token_data> id_to_token;
|
std::vector<token_data> id_to_token;
|
||||||
|
|
||||||
std::vector<id> special_tokens_cache;
|
std::vector<id> cache_special_tokens;
|
||||||
|
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = false);
|
||||||
|
std::vector<token> cache_token_to_piece_special; // llama_token_to_piece(special = true);
|
||||||
|
|
||||||
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
|
||||||
|
|
||||||
|
@@ -4596,20 +4599,14 @@ static void llm_load_vocab(
             vocab.special_cls_id  = 101;
             vocab.special_mask_id = 103;
             vocab.add_space_prefix = false;
-        } else {
-            if (tokenizer_model == "gpt2") {
-                vocab.type = LLAMA_VOCAB_TYPE_BPE;
+        } else if (tokenizer_model == "gpt2") {
+            vocab.type = LLAMA_VOCAB_TYPE_BPE;
 
             const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str());
             if (add_space_prefix_keyidx != -1) {
                 vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx);
             }
-            } else {
-                LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str());
-                LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__);
-                vocab.type = LLAMA_VOCAB_TYPE_SPM;
-                return;
-            }
+
             // read bpe merges and populate bpe ranks
             const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str());
             if (merges_keyidx == -1) {
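Together with the next hunk, this change turns tokenizer selection into a single else-if chain, and an unrecognized tokenizer name becomes a hard error instead of a silent fall-back to SPM. A minimal sketch of the new control flow, with simplified names; the enum and the "llama"/"bert"/"gpt2" mapping below are illustrative assumptions rather than the exact llm_load_vocab code:

// Sketch only: simplified stand-in for the tokenizer dispatch.
// The real function also loads merges, special token ids, etc.
#include <stdexcept>
#include <string>

enum class vocab_type { SPM, WPM, BPE };

static vocab_type pick_vocab_type(const std::string & tokenizer_model) {
    if (tokenizer_model == "llama") { return vocab_type::SPM; }
    if (tokenizer_model == "bert")  { return vocab_type::WPM; }
    if (tokenizer_model == "gpt2")  { return vocab_type::BPE; }
    // previously: warn and fall back to SPM; now: refuse to load
    throw std::runtime_error("unknown tokenizer: '" + tokenizer_model + "'");
}

int main() {
    try {
        pick_vocab_type("t5"); // not handled above -> throws
    } catch (const std::exception & e) {
        // e.what() reports the unknown tokenizer name
    }
    return 0;
}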
@@ -4643,6 +4640,8 @@ static void llm_load_vocab(
             vocab.special_pad_id  = -1;
             vocab.special_cls_id  = -1;
             vocab.special_mask_id = -1;
+        } else {
+            throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
         }
 
         // for now, only BPE models have pre-tokenizers
@@ -4837,17 +4836,38 @@ static void llm_load_vocab(
     {
         for (llama_vocab::id id = 0; id < (llama_vocab::id)n_vocab; ++id) {
             if (vocab.id_to_token[id].type != LLAMA_TOKEN_TYPE_NORMAL) {
-                vocab.special_tokens_cache.push_back(id);
+                vocab.cache_special_tokens.push_back(id);
             }
         }
 
-        std::sort( vocab.special_tokens_cache.begin(), vocab.special_tokens_cache.end(),
+        std::sort( vocab.cache_special_tokens.begin(), vocab.cache_special_tokens.end(),
             [&] (const llama_vocab::id a, const llama_vocab::id b) {
                 return vocab.id_to_token[a].text.size() > vocab.id_to_token[b].text.size();
             }
         );
 
-        LLAMA_LOG_INFO("%s: special tokens cache size = %u.\n", __func__, (uint32_t)vocab.special_tokens_cache.size());
+        LLAMA_LOG_INFO("%s: special tokens cache size = %u\n", __func__, (uint32_t)vocab.cache_special_tokens.size());
+    }
+
+    // build token to piece caches
+    {
+        size_t size_cache = 0;
+
+        std::vector<llama_vocab::token> cache_token_to_piece        (n_vocab);
+        std::vector<llama_vocab::token> cache_token_to_piece_special(n_vocab);
+
+        for (uint32_t id = 0; id < n_vocab; ++id) {
+            cache_token_to_piece[id]         = llama_token_to_piece(&model, id, false);
+            cache_token_to_piece_special[id] = llama_token_to_piece(&model, id, true);
+
+            size_cache += cache_token_to_piece[id].size();
+            size_cache += cache_token_to_piece_special[id].size();
+        }
+
+        std::swap(vocab.cache_token_to_piece,         cache_token_to_piece);
+        std::swap(vocab.cache_token_to_piece_special, cache_token_to_piece_special);
+
+        LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
 }
 
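The comparator above orders the special-token cache by descending text length, so that during partitioning a longer special token is matched before any shorter token whose text it contains. A tiny self-contained illustration of that ordering; the token strings are invented for the example:

// Illustration only: order token ids so that ids with longer text come first,
// mirroring the comparator used for cache_special_tokens above.
#include <algorithm>
#include <cstdio>
#include <string>
#include <vector>

int main() {
    const std::vector<std::string> text = { "<s>", "<|endoftext|>", "</s>" }; // invented examples
    std::vector<int> ids = { 0, 1, 2 };

    std::sort(ids.begin(), ids.end(), [&](int a, int b) {
        return text[a].size() > text[b].size(); // longest piece first
    });

    for (int id : ids) {
        std::printf("%s\n", text[id].c_str()); // prints <|endoftext|>, </s>, <s>
    }
    return 0;
}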
@@ -11191,46 +11211,69 @@ struct llm_build_context {
                 }
 
                 // split into {n_head * n_embd_head_qk_nope, n_tokens}
-                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, 0);
+                struct ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        0);
                 cb(q_nope, "q_nope", il);
 
                 // and {n_head * n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, ggml_element_size(q) * hparams.n_embd_head_k, ggml_element_size(q) * hparams.n_embd_head_k * n_head, ggml_element_size(q) * n_embd_head_qk_nope);
+                struct ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens,
+                        ggml_row_size(q->type, hparams.n_embd_head_k),
+                        ggml_row_size(q->type, hparams.n_embd_head_k * n_head),
+                        ggml_row_size(q->type, n_embd_head_qk_nope));
                 cb(q_pe, "q_pe", il);
 
                 // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * compressed_kv_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
-                cb(compressed_kv_pe, "compressed_kv_pe", il);
+                struct ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur);
+                cb(kv_pe_compresseed, "kv_pe_compresseed", il);
 
                 // split into {kv_lora_rank, n_tokens}
-                struct ggml_tensor * compressed_kv = ggml_view_2d(ctx0, compressed_kv_pe, kv_lora_rank, n_tokens, compressed_kv_pe->nb[1], 0);
-                cb(compressed_kv, "compressed_kv", il);
+                struct ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        0);
+                cb(kv_compressed, "kv_compressed", il);
 
                 // and {n_embd_head_qk_rope, n_tokens}
-                struct ggml_tensor * k_pe = ggml_view_2d(ctx0, compressed_kv_pe, n_embd_head_qk_rope, n_tokens, compressed_kv_pe->nb[1], ggml_element_size(compressed_kv_pe)*kv_lora_rank);
+                struct ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens,
+                        kv_pe_compresseed->nb[1],
+                        kv_pe_compresseed->nb[1],
+                        ggml_row_size(kv_pe_compresseed->type, kv_lora_rank));
                 cb(k_pe, "k_pe", il);
 
-                compressed_kv = llm_build_norm(ctx0, compressed_kv, hparams,
+                kv_compressed = ggml_cont(ctx0, kv_compressed); // TODO: the CUDA backend does not support non-contiguous norm
+                kv_compressed = llm_build_norm(ctx0, kv_compressed, hparams,
                         model.layers[il].attn_kv_a_norm, NULL,
                         LLM_NORM_RMS, cb, il);
-                cb(compressed_kv, "compressed_kv", il);
+                cb(kv_compressed, "kv_compressed", il);
 
                 // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens}
-                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, compressed_kv);
+                struct ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed);
                 cb(kv, "kv", il);
 
                 // split into {n_head * n_embd_head_qk_nope, n_tokens}
-                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), 0);
+                struct ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens,
+                        ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v),
+                        ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        0);
                 cb(k_nope, "k_nope", il);
 
                 // and {n_head * n_embd_head_v, n_tokens}
-                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, ggml_element_size(kv) * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v), ggml_element_size(kv) * n_embd_head_qk_nope);
+                struct ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens,
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head),
+                        ggml_row_size(kv->type, (n_embd_head_qk_nope)));
                 cb(v_states, "v_states", il);
 
                 v_states = ggml_cont(ctx0, v_states);
                 cb(v_states, "v_states", il);
 
-                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, ggml_element_size(kv) * hparams.n_embd_head_v * n_head, 0);
+                v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens,
+                        ggml_row_size(kv->type, hparams.n_embd_head_v * n_head),
+                        0);
                 cb(v_states, "v_states", il);
 
+                q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 q_pe = ggml_rope_ext(
                         ctx0, q_pe, inp_pos, nullptr,
                         n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
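The recurring edit in this hunk swaps hand-written strides of the form ggml_element_size(t) * n for ggml_row_size(t->type, n). For quantized tensor types the two are not interchangeable: elements are packed into fixed-size blocks, so the byte length of a row of n elements has to be computed per block rather than per element. A small sketch of that arithmetic; the 32-element, 18-byte block is an assumed example layout used only for illustration:

// Sketch: per-block row-size arithmetic, the reason ggml_row_size(type, n)
// is used for view strides instead of ggml_element_size(t) * n.
#include <cassert>
#include <cstdio>

int main() {
    // Assumed block layout for illustration (not taken from the ggml headers):
    const int block_elems = 32; // elements packed per quantization block
    const int block_bytes = 18; // bytes occupied by one block

    const int n = 128;          // elements in one row of the view
    assert(n % block_elems == 0);

    const int row_bytes = (n / block_elems) * block_bytes; // 4 blocks * 18 bytes = 72
    std::printf("row of %d elements -> %d bytes\n", n, row_bytes);
    return 0;
}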
@@ -11239,8 +11282,9 @@ struct llm_build_context {
                 cb(q_pe, "q_pe", il);
 
                 // shared RoPE key
+                k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend does not support non-contiguous RoPE
                 k_pe = ggml_rope_ext(
-                        ctx0, ggml_view_3d(ctx0, k_pe, n_embd_head_qk_rope, 1, n_tokens, k_pe->nb[0], k_pe->nb[1], 0), inp_pos, nullptr,
+                        ctx0, k_pe, inp_pos, nullptr,
                         n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                         ext_factor, attn_factor_scaled, beta_fast, beta_slow
                 );
@@ -13213,7 +13257,7 @@ struct fragment_buffer_variant {
 
 static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<fragment_buffer_variant> & buffer) {
     // for each special token
-    for (const llama_vocab::id special_id : vocab.special_tokens_cache) {
+    for (const llama_vocab::id special_id : vocab.cache_special_tokens) {
         const auto & special_token = vocab.id_to_token[special_id].text;
 
         // for each text fragment
@@ -14372,7 +14416,7 @@ void llama_sample_repetition_penalties(
 
 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
-    const int64_t t_start_sample_us = ggml_time_us();
+    int64_t t_start_sample_us = ggml_time_us();
 
     bool allow_eog = false;
     for (const auto & stack : grammar->stacks) {
@@ -14384,12 +14428,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
     candidates_decoded.reserve(candidates->size);
-    std::vector<llama_grammar_candidate> candidates_grammar;
+
+    std::vector<llama_grammar_candidate> candidates_grammar;
     candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_piece(ctx, id, false);
+        const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(id);
 
         if (llama_token_is_eog(&ctx->model, id)) {
             if (!allow_eog) {
@@ -14589,7 +14634,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_piece(ctx, token, false);
+    const std::string & piece = ctx->model.vocab.cache_token_to_piece.at(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -18272,69 +18317,83 @@ static std::string llama_decode_text(const std::string & text) {
 
 // does not write null-terminator to buf
 int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) {
+    // if we have a cache - use it
+    {
+        const auto & cache = special ? model->vocab.cache_token_to_piece_special : model->vocab.cache_token_to_piece;
+
+        if (!cache.empty()) {
+            const auto & res = cache.at(token);
+            if (length < (int) res.size()) {
+                return -(int) res.size();
+            }
+            memcpy(buf, res.c_str(), res.size());
+            return res.size();
+        }
+    }
+
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
             case LLAMA_VOCAB_TYPE_WPM:
             case LLAMA_VOCAB_TYPE_SPM: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
                 if (llama_is_normal_token(model->vocab, token)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     llama_unescape_whitespace(result);
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (
                     (llama_is_user_defined_token(model->vocab, token)) ||
                     (llama_is_control_token     (model->vocab, token) && special)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                     if (length < 3) {
                         return -3;
                     }
                     memcpy(buf, "\xe2\x96\x85", 3);
                     return 3;
                 } else if (llama_is_byte_token(model->vocab, token)) {
                     if (length < 1) {
                         return -1;
                     }
                     buf[0] = llama_token_to_byte(model->vocab, token);
                     return 1;
                 }
                 break;
             }
             case LLAMA_VOCAB_TYPE_BPE: {
                 // NOTE: we accept all unsupported token types,
                 // suppressing them like CONTROL tokens.
                 if (llama_is_normal_token(model->vocab, token)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     result = llama_decode_text(result);
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 } else if (
                     (llama_is_user_defined_token(model->vocab, token)) ||
                     (llama_is_control_token     (model->vocab, token) && special)) {
                     std::string result = model->vocab.id_to_token[token].text;
                     if (length < (int) result.length()) {
                         return -(int) result.length();
                     }
                     memcpy(buf, result.c_str(), result.length());
                     return result.length();
                 }
                 break;
             }
             default:
                 GGML_ASSERT(false);
         }
     }
     return 0;
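Easy to miss inside the large hunk above is the convention the new cache path keeps: return the piece length on success, or the negative of the required length when the caller's buffer is too small, so the caller can resize and retry. A minimal sketch of that convention; the helper name and the plain vector-of-strings cache are simplifications, not the real llama.cpp types:

// Sketch of the cache-first copy convention: return the piece length on success,
// or the negative required length when the caller's buffer is too small.
#include <cstring>
#include <string>
#include <vector>

// 'cache' stands in for vocab.cache_token_to_piece (a simplification).
static int copy_cached_piece(const std::vector<std::string> & cache, int token, char * buf, int length) {
    const std::string & res = cache.at(token);
    if (length < (int) res.size()) {
        return -(int) res.size();       // caller should grow the buffer to -ret and retry
    }
    std::memcpy(buf, res.data(), res.size());
    return (int) res.size();
}

int main() {
    const std::vector<std::string> cache = { "Hello", " world" }; // invented pieces
    char buf[8];
    const int n = copy_cached_piece(cache, 1, buf, (int) sizeof(buf));
    return n > 0 ? 0 : 1;
}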
llama.h (4 changed lines)
@@ -424,8 +424,8 @@ extern "C" {
 
     LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
 
     LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
     LLAMA_API enum llama_rope_type  llama_rope_type  (const struct llama_model * model);
 
     LLAMA_API int32_t llama_n_vocab    (const struct llama_model * model);
     LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
@@ -4,7 +4,7 @@
 # Package versions must stay compatible across all top-level python scripts.
 #
 
--r ./requirements/requirements-convert.txt
+-r ./requirements/requirements-convert-legacy-llama.txt
 
 -r ./requirements/requirements-convert-hf-to-gguf.txt
 -r ./requirements/requirements-convert-hf-to-gguf-update.txt
@@ -1,2 +1,2 @@
--r ./requirements-convert.txt
+-r ./requirements-convert-legacy-llama.txt
 torch~=2.1.1
@@ -1,2 +1,2 @@
--r ./requirements-convert.txt
+-r ./requirements-convert-legacy-llama.txt
 torch~=2.1.1
@@ -1 +1 @@
--r ./requirements-convert.txt
+-r ./requirements-convert-legacy-llama.txt
@@ -166,7 +166,7 @@ if (( do_cleanup )); then
     rm -rf -- "$all_venv"
 fi
 
-check_convert_script convert.py
+check_convert_script examples/convert-legacy-llama.py
 for py in convert-*.py; do
     # skip convert-hf-to-gguf-update.py
     # TODO: the check is failing for some reason:
@@ -19,22 +19,22 @@ logger = logging.getLogger("compare-llama-bench")
 
 # Properties by which to differentiate results per commit:
 KEY_PROPERTIES = [
-    "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas",
-    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads",
-    "type_k", "type_v", "no_kv_offload", "tensor_split", "n_prompt", "n_gen"
+    "cpu_info", "gpu_info", "n_gpu_layers", "cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas",
+    "blas", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "embeddings", "n_threads",
+    "type_k", "type_v", "use_mmap", "no_kv_offload", "split_mode", "main_gpu", "tensor_split", "flash_attn", "n_prompt", "n_gen"
 ]
 
 # Properties that are boolean and are converted to Yes/No for the table:
-BOOL_PROPERTIES = ["cuda", "opencl", "metal", "gpu_blas", "blas"]
+BOOL_PROPERTIES = ["cuda", "opencl", "vulkan", "kompute", "metal", "sycl", "gpu_blas", "blas", "embeddings", "use_mmap", "no_kv_offload", "flash_attn"]
 
 # Header names for the table:
 PRETTY_NAMES = {
-    "cuda": "CUDA", "opencl": "OpenCL", "metal": "Metal", "gpu_blas": "GPU BLAS", "blas": "BLAS",
-    "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
-    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters",
-    "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type",
-    "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO",
-    "tensor_split": "Tensor split"
+    "cuda": "CUDA", "opencl": "OpenCL", "vulkan": "Vulkan", "kompute": "Kompute", "metal": "Metal", "sycl": "SYCL", "rpc": "RPC",
+    "gpu_blas": "GPU BLAS", "blas": "BLAS", "cpu_info": "CPU", "gpu_info": "GPU", "model_filename": "File", "model_type": "Model",
+    "model_size": "Model Size [GiB]", "model_n_params": "Num. of Par.", "n_batch": "Batch size", "n_ubatch": "Microbatch size",
+    "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "split_mode": "Split mode",
+    "main_gpu": "Main GPU", "no_kv_offload": "NKVO", "flash_attn": "FlashAttention", "tensor_split": "Tensor split",
+    "use_mmap": "Use mmap", "embeddings": "Embeddings",
 }
 
 DEFAULT_SHOW = ["model_type"]  # Always show these properties by default.
@@ -3,20 +3,20 @@
 set -e
 
 # LLaMA v1
-python3 convert.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../llama1/7B  --outfile models/llama-7b/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-legacy-llama.py ../llama1/13B --outfile models/llama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../llama1/30B --outfile models/llama-30b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../llama1/65B --outfile models/llama-65b/ggml-model-f16.gguf --outtype f16
 
 # LLaMA v2
-python3 convert.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../llama2/llama-2-7b  --outfile models/llama-7b-v2/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-legacy-llama.py ../llama2/llama-2-13b --outfile models/llama-13b-v2/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../llama2/llama-2-70b --outfile models/llama-70b-v2/ggml-model-f16.gguf --outtype f16
 
 # Code Llama
-python3 convert.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
-python3 convert.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
-python3 convert.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-7b/  --outfile models/codellama-7b/ggml-model-f16.gguf  --outtype f16
+python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-13b/ --outfile models/codellama-13b/ggml-model-f16.gguf --outtype f16
+python3 examples/convert-legacy-llama.py ../codellama/CodeLlama-34b/ --outfile models/codellama-34b/ggml-model-f16.gguf --outtype f16
 
 # Falcon
 python3 convert-falcon-hf-to-gguf.py ../falcon/falcon-7b 1
@@ -75,7 +75,7 @@ if [ "$1" -eq "1" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k
@@ -90,7 +90,7 @@ if [ "$1" -eq "2" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k
@@ -105,7 +105,7 @@ if [ "$1" -eq "3" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k
@@ -120,7 +120,7 @@ if [ "$1" -eq "4" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k
@@ -135,7 +135,7 @@ if [ "$1" -eq "5" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k
@@ -150,7 +150,7 @@ if [ "$1" -eq "6" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k
@@ -165,7 +165,7 @@ if [ "$1" -eq "7" ]; then
 
     cd /workspace/llama.cpp
 
-    python3 convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
+    python3 examples/convert-legacy-llama.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16
 
     ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0
     ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k
@@ -129,8 +129,11 @@ llama_target_and_test(test-rope.cpp)
 llama_target_and_test(test-model-load-cancel.cpp  LABEL "model")
 llama_target_and_test(test-autorelease.cpp        LABEL "model")
 
-llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
-target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
+if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
+    llama_target_and_test(test-json-schema-to-grammar.cpp   WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
+    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
+endif()
 
 # dummy executable - not installed
 get_filename_component(TEST_TARGET test-c.c NAME_WE)
@@ -1138,26 +1138,37 @@ struct test_soft_max : public test_case {
 // GGML_OP_ROPE
 struct test_rope : public test_case {
     const ggml_type type;
-    const std::array<int64_t, 4> ne;
+    const std::array<int64_t, 4> ne_a;
     int n_dims;
     int mode;
     int n_ctx;
+    float fs; // freq_scale
+    float ef; // ext_factor
+    float af; // attn_factor
     bool ff;
+    int v; // view (1 : non-contiguous a)
 
     std::string vars() override {
-        return VARS_TO_STR6(type, ne, n_dims, mode, n_ctx, ff);
+        return VARS_TO_STR10(type, ne_a, n_dims, mode, n_ctx, fs, ef, af, ff, v);
     }
 
     test_rope(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 10, 10, 1},
-            int n_dims = 10, int mode = 0, int n_ctx = 512, bool ff = false)
-        : type(type), ne(ne), n_dims(n_dims), mode(mode), n_ctx(n_ctx), ff(ff) {}
+            std::array<int64_t, 4> ne_a = {10, 10, 10, 1},
+            int n_dims = 10, int mode = 0, int n_ctx = 512, float fs = 1.0f, float ef = 0.0f, float af = 0.0f, bool ff = false, int v = 0)
+        : type(type), ne_a(ne_a), n_dims(n_dims), mode(mode), n_ctx(n_ctx), fs(fs), ef(ef), af(af), ff(ff), v(v) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
-        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
-        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne[2]);
+        ggml_tensor * a;
+        if (v & 1) {
+            auto ne = ne_a; ne[0] *= 2; ne[1] *= 4; ne[2] *= 3;
+            a = ggml_new_tensor(ctx, type, 4, ne.data());
+            a = ggml_view_4d(ctx, a, ne_a[0], ne_a[1], ne_a[2], ne_a[3], a->nb[1], a->nb[2], a->nb[3], 0);
+        } else {
+            a = ggml_new_tensor(ctx, type, 4, ne_a.data());
+        }
+        ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, ne_a[2]);
         ggml_tensor * freq = ff ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_dims/2) : nullptr;
-        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, 1.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+        ggml_tensor * out = ggml_rope_ext(ctx, a, pos, freq, n_dims, mode, n_ctx, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f);
         return out;
     }
 
@@ -1165,11 +1176,11 @@ struct test_rope : public test_case {
         for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
             if (t->type == GGML_TYPE_I32) {
                 // pos
-                std::vector<int> data(ne[2]);
-                for (int i = 0; i < ne[2]; i++) {
+                std::vector<int> data(ne_a[2]);
+                for (int i = 0; i < ne_a[2]; i++) {
                     data[i] = rand() % n_ctx;
                 }
-                ggml_backend_tensor_set(t, data.data(), 0, ne[2] * sizeof(int));
+                ggml_backend_tensor_set(t, data.data(), 0, ne_a[2] * sizeof(int));
             } else {
                 if (t->ne[0] == n_dims/2) {
                     // frequency factors in the range [0.9f, 1.1f]
@@ -2213,20 +2224,38 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 0.0f));
     test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {32, 2, 32, 1}, true, 0.1f, 8.0f));
 
-    for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
-        // TODO: ff not supported yet for !neox
-        test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, false)); // llama 7B
-        test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, false)); // llama 13B
-        test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, false)); // llama 30B
-        test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, false)); // llama 65B
+    {
+        bool all = true;
 
-        for (bool ff : {false, true}) { // freq_factors
-            test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512, ff)); // neox (falcon 7B)
-            test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512, ff)); // neox (falcon 7B)
-            test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512, ff)); // neox (falcon 40B)
-            test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512, ff)); // neox (falcon 40B)
-            test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512, ff)); // neox (stablelm)
-            test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512, ff)); // neox (phi-2)
+        for (float v : { 0, 1 }) {
+            for (float fs : { 1.0f, 1.4245f }) {
+                for (float ef : { 0.0f, 0.7465f }) {
+                    for (float af : { 1.0f, 1.4245f }) {
+                        for (ggml_type type : {GGML_TYPE_F32, GGML_TYPE_F16}) {
+                            // TODO: ff not supported yet for !neox
+                            test_cases.emplace_back(new test_rope(type, {128, 32, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 7B
+                            if (all) {
+                                test_cases.emplace_back(new test_rope(type, {128, 40, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 13B
+                                test_cases.emplace_back(new test_rope(type, {128, 52, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 30B
+                                test_cases.emplace_back(new test_rope(type, {128, 64, 10, 1}, 128, 0, 512, fs, ef, af, false, v)); // llama 65B
+                            }
+
+                            for (bool ff : {false, true}) { // freq_factors
+                                if (all) {
+                                    test_cases.emplace_back(new test_rope(type, { 64,   1, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,  71, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 7B)
+                                    test_cases.emplace_back(new test_rope(type, { 64,   8, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  20, 2, 512, fs, ef, af, ff, v)); // neox (stablelm)
+                                    test_cases.emplace_back(new test_rope(type, { 80,  32, 10, 1},  32, 2, 512, fs, ef, af, ff, v)); // neox (phi-2)
+                                }
+
+                                test_cases.emplace_back(new test_rope(type, { 64, 128, 10, 1},  64, 2, 512, fs, ef, af, ff, v)); // neox (falcon 40B)
+                            }
+                        }
+                        all = false;
+                    }
+                }
+            }
         }
     }
 