Merge branch 'master' into compilade/mamba2

commit 038d958333

132 changed files with 6559 additions and 5146 deletions
26  .devops/full-musa.Dockerfile  (new file)

@@ -0,0 +1,26 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1
+
+COPY requirements.txt requirements.txt
+COPY requirements requirements
+
+RUN pip install --upgrade pip setuptools wheel \
+&& pip install -r requirements.txt
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release -j$(nproc) && \
+cp build/bin/* .
+
+ENTRYPOINT ["/app/.devops/tools.sh"]
30  .devops/llama-cli-musa.Dockerfile  (new file)

@@ -0,0 +1,30 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+apt-get install -y build-essential git cmake
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target llama-cli -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+apt-get install -y libgomp1
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-cli /llama-cli
+
+ENTRYPOINT [ "/llama-cli" ]
35  .devops/llama-server-musa.Dockerfile  (new file)

@@ -0,0 +1,35 @@
+ARG UBUNTU_VERSION=22.04
+# This needs to generally match the container host's environment.
+ARG MUSA_VERSION=rc3.1.0
+# Target the MUSA build image
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+# Target the MUSA runtime image
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+
+FROM ${BASE_MUSA_DEV_CONTAINER} AS build
+
+RUN apt-get update && \
+apt-get install -y build-essential git cmake libcurl4-openssl-dev
+
+WORKDIR /app
+
+COPY . .
+
+RUN cmake -B build -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
+cmake --build build --config Release --target llama-server -j$(nproc)
+
+FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
+
+RUN apt-get update && \
+apt-get install -y libcurl4-openssl-dev libgomp1 curl
+
+COPY --from=build /app/build/ggml/src/libggml.so /libggml.so
+COPY --from=build /app/build/src/libllama.so /libllama.so
+COPY --from=build /app/build/bin/llama-server /llama-server
+
+# Must be set to 0.0.0.0 so it can listen to requests from host machine
+ENV LLAMA_ARG_HOST=0.0.0.0
+
+HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
+
+ENTRYPOINT [ "/llama-server" ]
4  .github/workflows/bench.yml.disabled  (vendored)

@@ -27,10 +27,10 @@ on:
 push:
 branches:
 - master
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 pull_request_target:
 types: [opened, synchronize, reopened]
-paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
+paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
 schedule:
 - cron: '04 2 * * *'
 
5  .github/workflows/build.yml  (vendored)

@@ -19,6 +19,11 @@ concurrency:
 group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+contents: write # for creating release
+
 env:
 BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
 GGML_NLOOP: 3
5  .github/workflows/close-issue.yml  (vendored)

@@ -3,6 +3,11 @@ on:
 schedule:
 - cron: "42 0 * * *"
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+issues: write
+
 jobs:
 close-issues:
 runs-on: ubuntu-latest
3  .github/workflows/docker.yml  (vendored)

@@ -43,6 +43,9 @@ jobs:
 - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
 - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+- { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" }
+- { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" }
+- { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" }
 # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
 #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
 #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
7  .github/workflows/nix-ci-aarch64.yml  (vendored)

@@ -21,6 +21,13 @@ concurrency:
 group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+id-token: write
+contents: read
+
 jobs:
 nix-build-aarch64:
 runs-on: ubuntu-latest
7  .github/workflows/nix-ci.yml  (vendored)

@@ -12,6 +12,13 @@ concurrency:
 group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
 cancel-in-progress: true
 
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+# https://github.com/DeterminateSystems/nix-installer-action?tab=readme-ov-file#with-flakehub
+id-token: write
+contents: read
+
 jobs:
 nix-eval:
 strategy:
CMakeLists.txt

@@ -63,7 +63,7 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
 option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
 
 # utils
-option(LLAMA_BUILD_COMMON "llama: build common utils library" ON)
+option(LLAMA_BUILD_COMMON "llama: build common utils library" ${LLAMA_STANDALONE})
 
 # extra artifacts
 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})

@@ -201,12 +201,12 @@ if (LLAMA_BUILD_COMMON)
 add_subdirectory(common)
 endif()
 
-if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
 include(CTest)
 add_subdirectory(tests)
 endif()
 
-if (LLAMA_BUILD_EXAMPLES)
+if (LLAMA_BUILD_COMMON AND LLAMA_BUILD_EXAMPLES)
 add_subdirectory(examples)
 add_subdirectory(pocs)
 endif()
CONTRIBUTING.md

@@ -1,24 +1,23 @@
 # Pull requests (for contributors)
 
 - Test your changes:
-- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the GGML library
+- Using the commands in the [`tests`](tests) folder. For instance, running the `./tests/test-backend-ops` command tests different backend implementations of the `ggml` library
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
-- Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
+- Optionally rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs
-- The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
-- Consider allowing write access to your branch for faster review
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 
 # Pull requests (for collaborators)
 
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
-- Optionally, pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
 
 # Coding guidelines
 
 - Avoid adding third-party dependencies, extra files, extra headers, etc.
 - Always consider cross-compatibility with other operating systems and architectures
-- Avoid fancy looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
+- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
 - There are no strict rules for the code style, but try to follow the patterns in the code (indentation, spaces, etc.). Vertical alignment makes things more readable and easier to batch edit
 - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
 - Naming usually optimizes for common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
18  Makefile

@@ -5,7 +5,6 @@ BUILD_TARGETS = \
 llama-batched \
 llama-batched-bench \
 llama-bench \
-llama-benchmark-matmult \
 llama-cli \
 llama-convert-llama2c-to-ggml \
 llama-embedding \

@@ -68,7 +67,7 @@ TEST_TARGETS = \
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
 simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
-retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
+retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.

@@ -1055,10 +1054,11 @@ ggml/src/ggml-alloc.o: \
 $(CC) $(CFLAGS) -c $< -o $@
 
 ggml/src/ggml-backend.o: \
-ggml/src/ggml-backend.c \
+ggml/src/ggml-backend.cpp \
+ggml/src/ggml-backend-impl.h \
 ggml/include/ggml.h \
 ggml/include/ggml-backend.h
-$(CC) $(CFLAGS) -c $< -o $@
+$(CXX) $(CXXFLAGS) -c $< -o $@
 
 ggml/src/ggml-quants.o: \
 ggml/src/ggml-quants.c \

@@ -1523,16 +1523,6 @@ common/build-info.o: common/build-info.cpp
 
 tests: $(TEST_TARGETS)
 
-llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
-$(OBJ_GGML) common/build-info.o
-$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-run-benchmark-matmult: llama-benchmark-matmult
-./$@
-
-.PHONY: run-benchmark-matmult swift
-
 tests/test-arg-parser: tests/test-arg-parser.cpp \
 $(OBJ_ALL)
 $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
Package.swift

@@ -11,7 +11,7 @@ var sources = [
 "src/unicode-data.cpp",
 "ggml/src/ggml.c",
 "ggml/src/ggml-alloc.c",
-"ggml/src/ggml-backend.c",
+"ggml/src/ggml-backend.cpp",
 "ggml/src/ggml-quants.c",
 "ggml/src/ggml-aarch64.c",
 ]
README.md

@@ -31,7 +31,7 @@ variety of hardware - locally and in the cloud.
 - Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks
 - AVX, AVX2 and AVX512 support for x86 architectures
 - 1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use
-- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP)
+- Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads MTT GPUs via MUSA)
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity

@@ -92,6 +92,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [EXAONE-3.0-7.8B-Instruct](https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct)
 - [x] [FalconMamba Models](https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a)
 - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat)
+- [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a)
 
 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))
 

@@ -168,6 +169,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [AIKit](https://github.com/sozercan/aikit) (MIT)
 - [LARS - The LLM & Advanced Referencing Solution](https://github.com/abgulati/LARS) (AGPL)
 - [LLMUnity](https://github.com/undreamai/LLMUnity) (MIT)
+- [Llama Assistant](https://github.com/vietanhdev/llama-assistant) (GPL)
 
 *(to have a project listed here, it should clearly state that it depends on `llama.cpp`)*
 

@@ -411,7 +413,7 @@ Please refer to [Build llama.cpp locally](./docs/build.md)
 | [BLAS](./docs/build.md#blas-build) | All |
 | [BLIS](./docs/backend/BLIS.md) | All |
 | [SYCL](./docs/backend/SYCL.md) | Intel and Nvidia GPU |
-| [MUSA](./docs/build.md#musa) | Moore Threads GPU |
+| [MUSA](./docs/build.md#musa) | Moore Threads MTT GPU |
 | [CUDA](./docs/build.md#cuda) | Nvidia GPU |
 | [hipBLAS](./docs/build.md#hipblas) | AMD GPU |
 | [Vulkan](./docs/build.md#vulkan) | GPU |
ci/run.sh

@@ -1,4 +1,4 @@
-#/bin/bash
+#!/bin/bash
 #
 # sample usage:
 #

@@ -751,7 +751,8 @@ function gg_run_rerank_tiny {
 
 model_f16="${path_models}/ggml-model-f16.gguf"
 
-(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s><s>hi\nwhat is panda?</s><s>it's a bear\nwhat is panda?</s><s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+# for this model, the SEP token is "</s>"
+(time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
 # sample output
 # rerank score 0: 0.029

@@ -774,7 +775,7 @@ function gg_run_rerank_tiny {
 
 check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
 check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
-check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.15" | tee -a $OUT/${ci}-rk-f16.log
+check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log
 
 set +e
 }
1139  common/arg.cpp  (file diff suppressed because it is too large)
44  common/arg.h

@@ -10,7 +10,7 @@
 // CLI argument parsing
 //
 
-struct llama_arg {
+struct common_arg {
 std::set<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
 std::vector<const char *> args;
 const char * value_hint = nullptr; // help text or example for arg value

@@ -18,60 +18,60 @@ struct llama_arg {
 const char * env = nullptr;
 std::string help;
 bool is_sparam = false; // is current arg a sampling param?
-void (*handler_void) (gpt_params & params) = nullptr;
-void (*handler_string) (gpt_params & params, const std::string &) = nullptr;
-void (*handler_str_str)(gpt_params & params, const std::string &, const std::string &) = nullptr;
-void (*handler_int) (gpt_params & params, int) = nullptr;
+void (*handler_void) (common_params & params) = nullptr;
+void (*handler_string) (common_params & params, const std::string &) = nullptr;
+void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
+void (*handler_int) (common_params & params, int) = nullptr;
 
-llama_arg(
+common_arg(
 const std::initializer_list<const char *> & args,
 const char * value_hint,
 const std::string & help,
-void (*handler)(gpt_params & params, const std::string &)
+void (*handler)(common_params & params, const std::string &)
 ) : args(args), value_hint(value_hint), help(help), handler_string(handler) {}
 
-llama_arg(
+common_arg(
 const std::initializer_list<const char *> & args,
 const char * value_hint,
 const std::string & help,
-void (*handler)(gpt_params & params, int)
+void (*handler)(common_params & params, int)
 ) : args(args), value_hint(value_hint), help(help), handler_int(handler) {}
 
-llama_arg(
+common_arg(
 const std::initializer_list<const char *> & args,
 const std::string & help,
-void (*handler)(gpt_params & params)
+void (*handler)(common_params & params)
 ) : args(args), help(help), handler_void(handler) {}
 
 // support 2 values for arg
-llama_arg(
+common_arg(
 const std::initializer_list<const char *> & args,
 const char * value_hint,
 const char * value_hint_2,
 const std::string & help,
-void (*handler)(gpt_params & params, const std::string &, const std::string &)
+void (*handler)(common_params & params, const std::string &, const std::string &)
 ) : args(args), value_hint(value_hint), value_hint_2(value_hint_2), help(help), handler_str_str(handler) {}
 
-llama_arg & set_examples(std::initializer_list<enum llama_example> examples);
-llama_arg & set_env(const char * env);
-llama_arg & set_sparam();
+common_arg & set_examples(std::initializer_list<enum llama_example> examples);
+common_arg & set_env(const char * env);
+common_arg & set_sparam();
 bool in_example(enum llama_example ex);
 bool get_value_from_env(std::string & output);
 bool has_value_from_env();
 std::string to_string();
 };
 
-struct gpt_params_context {
+struct common_params_context {
 enum llama_example ex = LLAMA_EXAMPLE_COMMON;
-gpt_params & params;
-std::vector<llama_arg> options;
+common_params & params;
+std::vector<common_arg> options;
 void(*print_usage)(int, char **) = nullptr;
-gpt_params_context(gpt_params & params) : params(params) {}
+common_params_context(common_params & params) : params(params) {}
 };
 
 // parse input arguments from CLI
 // if one argument has invalid value, it will automatically display usage of the specific argument (and not the full usage message)
-bool gpt_params_parse(int argc, char ** argv, gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
 
 // function to be used by test-arg-parser
-gpt_params_context gpt_params_parser_init(gpt_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
+common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
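For orientation, here is a minimal sketch of how the renamed argument API is consumed, based only on the declarations in the hunk above. The `common_params` struct, its `model` field and the `"LLAMA_ARG_MODEL"` environment variable name live in `common.h`/`arg.cpp`, which are not shown in this excerpt, so treat them as assumptions; option registration normally happens inside `common_params_parser_init`.

```cpp
// Sketch only - assumes common.h defines common_params (with a std::string
// `model` field) and the llama_example enum used below.
#include "arg.h"
#include "common.h"

#include <cstdio>
#include <string>

static void print_usage(int, char ** argv) {
    printf("usage: %s [options]\n", argv[0]);
}

int main(int argc, char ** argv) {
    common_params params;

    // An option is now described by common_arg (previously llama_arg);
    // capture-less lambdas convert to the plain function-pointer handlers.
    common_arg opt(
        {"-m", "--model"}, "FNAME",
        "path to the model file",
        [](common_params & p, const std::string & value) { p.model = value; }
    );
    opt.set_env("LLAMA_ARG_MODEL"); // env var name is an assumption here

    // Parsing goes through common_params_parse (previously gpt_params_parse).
    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
        return 1;
    }
    return 0;
}
```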
common/common.cpp

@@ -12,6 +12,7 @@
 
 #include <algorithm>
 #include <cinttypes>
+#include <climits>
 #include <cmath>
 #include <codecvt>
 #include <cstdarg>

@@ -23,10 +24,10 @@
 #include <regex>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
-#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <sys/types.h>

@@ -362,10 +363,10 @@ bool parse_cpu_mask(const std::string & mask, bool (&boolmask)[GGML_MAX_N_THREAD
 return true;
 }
 
-void gpt_init() {
+void common_init() {
 llama_log_set([](ggml_log_level level, const char * text, void * /*user_data*/) {
-if (LOG_DEFAULT_LLAMA <= gpt_log_verbosity_thold) {
-gpt_log_add(gpt_log_main(), level, "%s", text);
+if (LOG_DEFAULT_LLAMA <= common_log_verbosity_thold) {
+common_log_add(common_log_main(), level, "%s", text);
 }
 }, NULL);
 

@@ -378,7 +379,7 @@ void gpt_init() {
 LOG_INF("build: %d (%s) with %s for %s%s\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT, LLAMA_COMPILER, LLAMA_BUILD_TARGET, build_type);
 }
 
-std::string gpt_params_get_system_info(const gpt_params & params) {
+std::string common_params_get_system_info(const common_params & params) {
 std::ostringstream os;
 
 os << "system_info: n_threads = " << params.cpuparams.n_threads;

@@ -400,6 +401,21 @@ std::string gpt_params_get_system_info(const gpt_params & params) {
 // String utils
 //
 
+std::string string_format(const char * fmt, ...) {
+va_list ap;
+va_list ap2;
+va_start(ap, fmt);
+va_copy(ap2, ap);
+int size = vsnprintf(NULL, 0, fmt, ap);
+GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+std::vector<char> buf(size + 1);
+int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+GGML_ASSERT(size2 == size);
+va_end(ap2);
+va_end(ap);
+return std::string(buf.data(), size);
+}
+
 std::vector<std::string> string_split(std::string input, char separator) {
 std::vector<std::string> parts;
 size_t separator_pos = input.find(separator);
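The `string_format` helper added in the hunk above is a printf-style formatter that returns a `std::string`. A short usage sketch; the declaration is assumed to be exported via `common/common.h`, which is not part of this excerpt:

```cpp
// Sketch: format a message without a fixed-size char buffer.
#include "common.h"

#include <string>

std::string describe_threads(int n_threads) {
    // Behaves like snprintf into a temporary buffer, but returns std::string.
    return string_format("system_info: n_threads = %d", n_threads);
}
```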
@@ -493,7 +509,7 @@ std::string string_from(const struct llama_context * ctx, const std::vector<llam
 first = false;
 }
 
-auto detokenized = llama_token_to_piece(ctx, token);
+auto detokenized = common_token_to_piece(ctx, token);
 
 detokenized.erase(
 std::remove_if(

@@ -524,7 +540,7 @@ std::string string_from(const struct llama_context * ctx, const struct llama_bat
 first = false;
 }
 
-auto detokenized = llama_token_to_piece(ctx, batch.token[i]);
+auto detokenized = common_token_to_piece(ctx, batch.token[i]);
 
 detokenized.erase(
 std::remove_if(

@@ -819,16 +835,16 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
-llama_init_result iparams;
-auto mparams = llama_model_params_from_gpt_params(params);
+struct common_init_result common_init_from_params(common_params & params) {
+common_init_result iparams;
+auto mparams = common_model_params_to_llama(params);
 
 llama_model * model = nullptr;
 
 if (!params.hf_repo.empty() && !params.hf_file.empty()) {
-model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+model = common_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else if (!params.model_url.empty()) {
-model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
+model = common_load_model_from_url(params.model_url.c_str(), params.model.c_str(), params.hf_token.c_str(), mparams);
 } else {
 model = llama_load_model_from_file(params.model.c_str(), mparams);
 }

@@ -838,7 +854,32 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 return iparams;
 }
 
-auto cparams = llama_context_params_from_gpt_params(params);
+if (params.reranking) {
+bool ok = true;
+
+if (llama_token_bos(model) == LLAMA_TOKEN_NULL) {
+LOG_WRN("%s: warning: model does not have a BOS token, reranking will not work\n", __func__);
+ok = false;
+}
+
+if (llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+LOG_WRN("%s: warning: model does not have an EOS token, reranking will not work\n", __func__);
+ok = false;
+}
+
+if (llama_token_sep(model) == LLAMA_TOKEN_NULL) {
+LOG_WRN("%s: warning: model does not have a SEP token, reranking will not work\n", __func__);
+ok = false;
+}
+
+if (!ok) {
+llama_free_model(model);
+
+return iparams;
+}
+}
+
+auto cparams = common_context_params_to_llama(params);
 
 llama_context * lctx = llama_new_context_with_model(model, cparams);
 if (lctx == NULL) {

@@ -851,10 +892,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 if (params.control_vector_layer_start <= 0) params.control_vector_layer_start = 1;
 if (params.control_vector_layer_end <= 0) params.control_vector_layer_end = llama_n_layer(model);
 
-const auto cvec = llama_control_vector_load(params.control_vectors);
+const auto cvec = common_control_vector_load(params.control_vectors);
 if (cvec.n_embd == -1) {
 llama_free(lctx);
 llama_free_model(model);
+
 return iparams;
 }
 
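For context, the renamed initialization entry point above is typically consumed as follows; a minimal sketch, assuming the `common_init_result` fields `model` and `context` exactly as they appear in these hunks, with error handling reduced to null checks:

```cpp
// Sketch: load a model and context through the renamed helper.
#include "common.h"
#include "llama.h"

int load_and_free(common_params & params) {
    // Previously llama_init_from_gpt_params(); with params.reranking set,
    // the BOS/EOS/SEP checks added above can make this return empty results.
    common_init_result init = common_init_from_params(params);

    llama_model   * model = init.model;
    llama_context * ctx   = init.context;
    if (model == nullptr || ctx == nullptr) {
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```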
@@ -867,13 +909,14 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 if (err) {
 llama_free(lctx);
 llama_free_model(model);
+
 return iparams;
 }
 }
 
 // load and optionally apply lora adapters
 for (auto & la : params.lora_adapters) {
-llama_lora_adapter_container loaded_la;
+common_lora_adapter_container loaded_la;
 loaded_la.path = la.path;
 loaded_la.scale = la.scale;
 loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());

@@ -886,10 +929,10 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
 }
 if (!params.lora_init_without_apply) {
-llama_lora_adapters_apply(lctx, iparams.lora_adapters);
+common_lora_adapters_apply(lctx, iparams.lora_adapters);
 }
 
-if (params.sparams.ignore_eos && llama_token_eos(model) == -1) {
+if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
 LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
 params.sparams.ignore_eos = false;
 }

@@ -930,10 +973,11 @@ struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
 
 iparams.model = model;
 iparams.context = lctx;
+
 return iparams;
 }
 
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters) {
 llama_lora_adapter_clear(ctx);
 for (auto & la : lora_adapters) {
 if (la.scale != 0.0f) {

@@ -942,7 +986,7 @@ void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lor
 }
 }
 
-struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
+struct llama_model_params common_model_params_to_llama(const common_params & params) {
 auto mparams = llama_model_default_params();
 
 if (params.n_gpu_layers != -1) {

@@ -994,7 +1038,7 @@ static ggml_type kv_cache_type_from_str(const std::string & s) {
 throw std::runtime_error("Invalid cache type: " + s);
 }
 
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) {
+struct llama_context_params common_context_params_to_llama(const common_params & params) {
 auto cparams = llama_context_default_params();
 
 cparams.n_ctx = params.n_ctx;

@@ -1084,7 +1128,7 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_
 return false;
 }
 
-static bool llama_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
+static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) {
 
 // Initialize libcurl
 std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);

@@ -1154,15 +1198,15 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 }
 
 // Send a HEAD request to retrieve the etag and last-modified headers
-struct llama_load_model_from_url_headers {
+struct common_load_model_from_url_headers {
 std::string etag;
 std::string last_modified;
 };
-llama_load_model_from_url_headers headers;
+common_load_model_from_url_headers headers;
 {
 typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
 auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
-llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata;
+common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata;
 
 static std::regex header_regex("([^:]+): (.*)\r\n");
 static std::regex etag_regex("ETag", std::regex_constants::icase);

@@ -1298,7 +1342,7 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 return true;
 }
 
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
 const char * model_url,
 const char * path_model,
 const char * hf_token,

@@ -1309,7 +1353,7 @@ struct llama_model * llama_load_model_from_url(
 return NULL;
 }
 
-if (!llama_download_file(model_url, path_model, hf_token)) {
+if (!common_download_file(model_url, path_model, hf_token)) {
 return NULL;
 }
 

@@ -1362,7 +1406,7 @@ struct llama_model * llama_load_model_from_url(
 char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0};
 llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split);
 
-return llama_download_file(split_url, split_path, hf_token);
+return common_download_file(split_url, split_path, hf_token);
 }, idx));
 }
 

@@ -1377,7 +1421,7 @@ struct llama_model * llama_load_model_from_url(
 return llama_load_model_from_file(path_model, params);
 }
 
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
 const char * repo,
 const char * model,
 const char * path_model,

@@ -1397,12 +1441,12 @@ struct llama_model * llama_load_model_from_hf(
 model_url += "/resolve/main/";
 model_url += model;
 
-return llama_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
+return common_load_model_from_url(model_url.c_str(), path_model, hf_token, params);
 }
 
 #else
 
-struct llama_model * llama_load_model_from_url(
+struct llama_model * common_load_model_from_url(
 const char * /*model_url*/,
 const char * /*path_model*/,
 const char * /*hf_token*/,

@@ -1411,7 +1455,7 @@ struct llama_model * llama_load_model_from_url(
 return nullptr;
 }
 
-struct llama_model * llama_load_model_from_hf(
+struct llama_model * common_load_model_from_hf(
 const char * /*repo*/,
 const char * /*model*/,
 const char * /*path_model*/,

@@ -1427,11 +1471,11 @@ struct llama_model * llama_load_model_from_hf(
 // Batch utils
 //
 
-void llama_batch_clear(struct llama_batch & batch) {
+void common_batch_clear(struct llama_batch & batch) {
 batch.n_tokens = 0;
 }
 
-void llama_batch_add(
+void common_batch_add(
 struct llama_batch & batch,
 llama_token id,
 llama_pos pos,
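The batch helpers renamed directly above are thin wrappers over the `llama_batch` C API. A sketch of the usual pattern; note that the trailing `seq_ids`/`logits` parameters of `common_batch_add` are not visible in this truncated hunk and are assumed from `common.h`:

```cpp
// Sketch: refill a llama_batch with a token sequence (single sequence id 0).
#include "common.h"
#include "llama.h"

#include <vector>

void fill_batch(llama_batch & batch, const std::vector<llama_token> & tokens) {
    common_batch_clear(batch); // was llama_batch_clear

    for (size_t i = 0; i < tokens.size(); ++i) {
        const bool need_logits = (i == tokens.size() - 1);
        // was llama_batch_add; the seq_ids/logits arguments are assumptions here
        common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, need_logits);
    }
}
```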
@@ -1454,15 +1498,15 @@ void llama_batch_add(
 // Vocab utils
 //
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
 const struct llama_context * ctx,
 const std::string & text,
 bool add_special,
 bool parse_special) {
-return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special);
+return common_tokenize(llama_get_model(ctx), text, add_special, parse_special);
 }
 
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
 const struct llama_model * model,
 const std::string & text,
 bool add_special,

@@ -1481,7 +1525,7 @@ std::vector<llama_token> llama_tokenize(
 return result;
 }
 
-std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
+std::string common_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
 std::string piece;
 piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
 const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);

@@ -1497,7 +1541,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t
 return piece;
 }
 
-std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+std::string common_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
 std::string text;
 text.resize(std::max(text.capacity(), tokens.size()));
 int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);

@@ -1517,15 +1561,15 @@ std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token>
 // Chat template utils
 //
 
-bool llama_chat_verify_template(const std::string & tmpl) {
+bool common_chat_verify_template(const std::string & tmpl) {
 llama_chat_message chat[] = {{"user", "test"}};
 int res = llama_chat_apply_template(nullptr, tmpl.c_str(), chat, 1, true, nullptr, 0);
 return res >= 0;
 }
 
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<llama_chat_msg> & msgs,
+const std::vector<common_chat_msg> & msgs,
 bool add_ass) {
 int alloc_size = 0;
 bool fallback = false; // indicate if we must fallback to default chatml
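As a quick reference for the renamed vocab helpers above, a round-trip sketch using only the signatures visible in these hunks:

```cpp
// Sketch: text -> tokens -> text with the renamed helpers.
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

std::string roundtrip(llama_context * ctx, const std::string & text) {
    // was llama_tokenize
    std::vector<llama_token> tokens = common_tokenize(ctx, text, /*add_special*/ true, /*parse_special*/ false);

    // was llama_detokenize
    return common_detokenize(ctx, tokens, /*special*/ false);
}
```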
@@ -1567,42 +1611,42 @@ std::string llama_chat_apply_template(const struct llama_model * model,
 return formatted_chat;
 }
 
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
 const std::string & tmpl,
-const std::vector<llama_chat_msg> & past_msg,
-const llama_chat_msg & new_msg,
+const std::vector<common_chat_msg> & past_msg,
+const common_chat_msg & new_msg,
 bool add_ass) {
 std::ostringstream ss;
-auto fmt_past_msg = past_msg.empty() ? "" : llama_chat_apply_template(model, tmpl, past_msg, false);
-std::vector<llama_chat_msg> chat_new(past_msg);
+auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(model, tmpl, past_msg, false);
+std::vector<common_chat_msg> chat_new(past_msg);
 // if the past_msg ends with a newline, we must preserve it in the formatted version
 if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
 ss << "\n";
 };
 // format chat with new_msg
 chat_new.push_back(new_msg);
-auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
+auto fmt_new_msg = common_chat_apply_template(model, tmpl, chat_new, add_ass);
 // get the diff part
 ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
 return ss.str();
 }
 
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
 const std::string & tmpl) {
-std::vector<llama_chat_msg> msgs = {
+std::vector<common_chat_msg> msgs = {
 {"system", "You are a helpful assistant"},
 {"user", "Hello"},
 {"assistant", "Hi there"},
 {"user", "How are you?"},
 };
-return llama_chat_apply_template(model, tmpl, msgs, true);
+return common_chat_apply_template(model, tmpl, msgs, true);
 }
 
 //
 // KV cache utils
 //
 
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",

@@ -1625,7 +1669,7 @@ void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size) {
 printf("\n=== Done dumping\n");
 }
 
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size) {
 static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
 printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
@ -1677,7 +1721,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
|
||||||
// Embedding utils
|
// Embedding utils
|
||||||
//
|
//
|
||||||
|
|
||||||
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
|
void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) {
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
|
|
||||||
switch (embd_norm) {
|
switch (embd_norm) {
|
||||||
|
@ -1711,7 +1755,7 @@ void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n){
|
float common_embd_similarity_cos(const float * embd1, const float * embd2, int n){
|
||||||
double sum = 0.0;
|
double sum = 0.0;
|
||||||
double sum1 = 0.0;
|
double sum1 = 0.0;
|
||||||
double sum2 = 0.0;
|
double sum2 = 0.0;
|
||||||
|
@ -1737,8 +1781,8 @@ float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n)
|
||||||
// Control vector utils
|
// Control vector utils
|
||||||
//
|
//
|
||||||
|
|
||||||
static llama_control_vector_data llama_control_vector_load_one(const llama_control_vector_load_info & load_info) {
|
static common_control_vector_data common_control_vector_load_one(const common_control_vector_load_info & load_info) {
|
||||||
llama_control_vector_data result = { -1, {} };
|
common_control_vector_data result = { -1, {} };
|
||||||
|
|
||||||
ggml_context * ctx = nullptr;
|
ggml_context * ctx = nullptr;
|
||||||
struct gguf_init_params meta_gguf_params = {
|
struct gguf_init_params meta_gguf_params = {
|
||||||
|
@ -1822,11 +1866,11 @@ static llama_control_vector_data llama_control_vector_load_one(const llama_contr
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos) {
|
common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos) {
|
||||||
llama_control_vector_data result = { -1, {} };
|
common_control_vector_data result = { -1, {} };
|
||||||
|
|
||||||
for (const auto & info : load_infos) {
|
for (const auto & info : load_infos) {
|
||||||
auto cur = llama_control_vector_load_one(info);
|
auto cur = common_control_vector_load_one(info);
|
||||||
|
|
||||||
if (cur.n_embd == -1) {
|
if (cur.n_embd == -1) {
|
||||||
result.n_embd = -1;
|
result.n_embd = -1;
|
||||||
|
@ -1918,7 +1962,7 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const llama_context * lctx,
|
void yaml_dump_non_result_info(FILE * stream, const common_params & params, const llama_context * lctx,
|
||||||
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc) {
|
||||||
const auto & sparams = params.sparams;
|
const auto & sparams = params.sparams;
|
||||||
|
|
||||||
|
|
129
common/common.h

@@ -24,12 +24,12 @@

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
std::string path;
float scale;
};

-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
struct llama_lora_adapter * adapter;
};

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
extern char const * LLAMA_COMPILER;
extern char const * LLAMA_BUILD_TARGET;

-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;

//
// CPU utils
@@ -82,14 +82,14 @@ enum llama_example {
LLAMA_EXAMPLE_COUNT,
};

-enum gpt_sampler_type {
+enum common_sampler_type {
-GPT_SAMPLER_TYPE_NONE = 0,
+COMMON_SAMPLER_TYPE_NONE = 0,
-GPT_SAMPLER_TYPE_TOP_K = 1,
+COMMON_SAMPLER_TYPE_TOP_K = 1,
-GPT_SAMPLER_TYPE_TOP_P = 2,
+COMMON_SAMPLER_TYPE_TOP_P = 2,
-GPT_SAMPLER_TYPE_MIN_P = 3,
+COMMON_SAMPLER_TYPE_MIN_P = 3,
-GPT_SAMPLER_TYPE_TFS_Z = 4,
+COMMON_SAMPLER_TYPE_TFS_Z = 4,
-GPT_SAMPLER_TYPE_TYPICAL_P = 5,
+COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
-GPT_SAMPLER_TYPE_TEMPERATURE = 6,
+COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
};

// dimensionality reduction methods, used by cvector-generator
@@ -99,7 +99,7 @@ enum dimre_method {
};

// sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

int32_t n_prev = 64; // number of previous tokens to remember

@@ -124,13 +124,13 @@ struct gpt_sampler_params {
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics

-std::vector<enum gpt_sampler_type> samplers = {
+std::vector<enum common_sampler_type> samplers = {
-GPT_SAMPLER_TYPE_TOP_K,
+COMMON_SAMPLER_TYPE_TOP_K,
-GPT_SAMPLER_TYPE_TFS_Z,
+COMMON_SAMPLER_TYPE_TFS_Z,
-GPT_SAMPLER_TYPE_TYPICAL_P,
+COMMON_SAMPLER_TYPE_TYPICAL_P,
-GPT_SAMPLER_TYPE_TOP_P,
+COMMON_SAMPLER_TYPE_TOP_P,
-GPT_SAMPLER_TYPE_MIN_P,
+COMMON_SAMPLER_TYPE_MIN_P,
-GPT_SAMPLER_TYPE_TEMPERATURE
+COMMON_SAMPLER_TYPE_TEMPERATURE
};

std::string grammar; // optional BNF-like grammar to constrain sampling

@@ -141,7 +141,7 @@ struct gpt_sampler_params {
std::string print() const;
};

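Brief illustration of the renamed sampler parameters (not part of the diff; assumes the declarations above are visible): the defaults are kept, so a caller typically only overrides the chain order or the grammar.

    static void print_sampler_config() {
        common_sampler_params sparams;   // defaults as declared above
        sparams.samplers = { COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TEMPERATURE };
        printf("%s\n", sparams.print().c_str());   // print() is declared in the struct above
    }
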
-struct gpt_params {
+struct common_params {
int32_t n_predict = -1; // new tokens to predict
int32_t n_ctx = 0; // context size
int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)

@@ -183,7 +183,7 @@ struct gpt_params {
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-struct gpt_sampler_params sparams;
+struct common_sampler_params sparams;

std::string model = ""; // model path // NOLINT
std::string model_draft = ""; // draft model for speculative decoding // NOLINT

@@ -208,9 +208,9 @@ struct gpt_params {
std::vector<llama_model_kv_override> kv_overrides;

bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

int32_t verbosity = 0;
int32_t control_vector_layer_start = -1; // layer range for control vector

@@ -282,7 +282,6 @@ struct gpt_params {
std::string hostname = "127.0.0.1";
std::string public_path = ""; // NOLINT
std::string chat_template = ""; // NOLINT
-std::string system_prompt = ""; // NOLINT
bool enable_chat_template = true;

std::vector<std::string> api_keys;

@@ -290,7 +289,10 @@ struct gpt_params {
std::string ssl_file_key = ""; // NOLINT
std::string ssl_file_cert = ""; // NOLINT

-bool endpoint_slots = true;
+// "advanced" endpoints are disabled by default for better security
+bool webui = true;
+bool endpoint_slots = false;
+bool endpoint_props = false; // only control POST requests, not GET
bool endpoint_metrics = false;

bool log_json = false;

@@ -345,9 +347,9 @@ struct gpt_params {

// call once at the start of a program if it uses libcommon
// initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();

-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);

bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);

@@ -358,6 +360,19 @@ bool set_process_priority(enum ggml_sched_priority prio);
// String utils
//

+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
+
std::vector<std::string> string_split(std::string input, char separator);

std::string string_strip(const std::string & str);

@@ -401,29 +416,29 @@ std::string fs_get_cache_file(const std::string & filename);
// Model utils
//

-struct llama_init_result {
+struct common_init_result {
struct llama_model * model = nullptr;
struct llama_context * context = nullptr;
-std::vector<llama_lora_adapter_container> lora_adapters;
+std::vector<common_lora_adapter_container> lora_adapters;
};

-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
+struct llama_model_params common_model_params_to_llama (const common_params & params);
-struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

// clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

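Minimal sketch of the renamed initialization entry point (not part of the diff; uses only the declarations above, with DEFAULT_MODEL_PATH as a placeholder model path):

    static bool init_example() {
        common_params params;
        params.model = DEFAULT_MODEL_PATH;   // default path defined near the top of this header
        common_init_result init = common_init_from_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            return false;                    // load failed
        }
        // ... run inference with init.context ...
        return true;
    }
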
// Batch utils

-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);

-void llama_batch_add(
+void common_batch_add(
struct llama_batch & batch,
llama_token id,
llama_pos pos,

@@ -436,13 +451,13 @@ void llama_batch_add(

// tokenizes a string into a vector of tokens
// should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_context * ctx,
const std::string & text,
bool add_special,
bool parse_special = false);

-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
const struct llama_model * model,
const std::string & text,
bool add_special,

@@ -450,7 +465,7 @@ std::vector<llama_token> llama_tokenize(

// tokenizes a token into a piece, optionally renders special/control tokens
// should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
const struct llama_context * ctx,
llama_token token,
bool special = true);

@@ -458,7 +473,7 @@ std::string llama_token_to_piece(

// detokenizes a vector of tokens into a string
// should work similar to Python's `tokenizer.decode`
// optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
llama_context * ctx,
const std::vector<llama_token> & tokens,
bool special = true);
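Round-trip sketch with the renamed tokenization helpers (illustrative only, not part of the diff; assumes a `llama_context *` created elsewhere, e.g. via common_init_from_params):

    static void dump_tokens(llama_context * ctx) {
        std::vector<llama_token> toks = common_tokenize(ctx, "Hello world", /*add_special=*/true);
        for (llama_token t : toks) {
            printf("%d -> '%s'\n", t, common_token_to_piece(ctx, t).c_str());
        }
        const std::string round_trip = common_detokenize(ctx, toks, /*special=*/false);
        printf("detokenized: %s\n", round_trip.c_str());
    }
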
@@ -468,31 +483,31 @@ std::string llama_detokenize(
//

// same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
std::string role;
std::string content;
};

// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);

// CPP wrapper for llama_chat_apply_template
// If the built-in template is not supported, we default to chatml
// If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & chat,
+const std::vector<common_chat_msg> & chat,
bool add_ass);

// Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
const std::string & tmpl,
-const std::vector<llama_chat_msg> & past_msg,
+const std::vector<common_chat_msg> & past_msg,
-const llama_chat_msg & new_msg,
+const common_chat_msg & new_msg,
bool add_ass);

// Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
const std::string & tmpl);

//
@@ -500,31 +515,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
//

// Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

// Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

//
// Embedding utils
//

-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

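Self-contained illustration of the embedding utilities (not part of the diff; embd_norm = 2 matches the default above and is assumed here to select the Euclidean norm):

    static float compare_embeddings() {
        float a[4] = {0.1f, 0.2f, 0.3f, 0.4f};
        float b[4] = {0.1f, 0.2f, 0.3f, 0.5f};
        float an[4], bn[4];
        common_embd_normalize(a, an, 4, /*embd_norm=*/2);
        common_embd_normalize(b, bn, 4, /*embd_norm=*/2);
        return common_embd_similarity_cos(an, bn, 4);   // ~1.0 for near-identical vectors
    }
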
//
// Control vector utils
//

-struct llama_control_vector_data {
+struct common_control_vector_data {
int n_embd;

// stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
std::vector<float> data;
};

-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
float strength;

std::string fname;

@@ -532,7 +547,7 @@ struct llama_control_vector_load_info {

// Load control vectors, scale each by strength, and add them together.
// On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

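Sketch of blending two control vectors with the renamed loader (not part of the diff; the file names are placeholders, and per the comment above each vector is scaled by its strength before summation):

    static common_control_vector_data blend_control_vectors() {
        std::vector<common_control_vector_load_info> infos = {
            { /*strength=*/  0.8f, /*fname=*/ "cv-happy.gguf"   },   // placeholder file
            { /*strength=*/ -0.4f, /*fname=*/ "cv-verbose.gguf" },   // placeholder file
        };
        common_control_vector_data cv = common_control_vector_load(infos);
        // cv.n_embd == -1 signals a load error, per "On error, returns {-1, empty}"
        return cv;
    }
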
//
// Split utils
@@ -551,5 +566,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

void yaml_dump_non_result_info(
-FILE * stream, const gpt_params & params, const llama_context * lctx,
+FILE * stream, const common_params & params, const llama_context * lctx,
const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);

100
common/log.cpp

@@ -8,10 +8,10 @@
#include <thread>
#include <vector>

-int gpt_log_verbosity_thold = LOG_DEFAULT_LLAMA;
+int common_log_verbosity_thold = LOG_DEFAULT_LLAMA;

-void gpt_log_set_verbosity_thold(int verbosity) {
+void common_log_set_verbosity_thold(int verbosity) {
-gpt_log_verbosity_thold = verbosity;
+common_log_verbosity_thold = verbosity;
}

#define LOG_COL_DEFAULT "\033[0m"

@@ -29,16 +29,16 @@ static int64_t t_us() {
}

// colors
-enum gpt_log_col : int {
+enum common_log_col : int {
-GPT_LOG_COL_DEFAULT = 0,
+COMMON_LOG_COL_DEFAULT = 0,
-GPT_LOG_COL_BOLD,
+COMMON_LOG_COL_BOLD,
-GPT_LOG_COL_RED,
+COMMON_LOG_COL_RED,
-GPT_LOG_COL_GREEN,
+COMMON_LOG_COL_GREEN,
-GPT_LOG_COL_YELLOW,
+COMMON_LOG_COL_YELLOW,
-GPT_LOG_COL_BLUE,
+COMMON_LOG_COL_BLUE,
-GPT_LOG_COL_MAGENTA,
+COMMON_LOG_COL_MAGENTA,
-GPT_LOG_COL_CYAN,
+COMMON_LOG_COL_CYAN,
-GPT_LOG_COL_WHITE,
+COMMON_LOG_COL_WHITE,
};

// disable colors by default
@@ -54,7 +54,7 @@ static std::vector<const char *> g_col = {
"",
};

-struct gpt_log_entry {
+struct common_log_entry {
enum ggml_log_level level;

bool prefix;

@@ -71,7 +71,7 @@ struct gpt_log_entry {
if (!fcur) {
// stderr displays DBG messages only when their verbosity level is not higher than the threshold
// these messages will still be logged to a file
-if (level == GGML_LOG_LEVEL_DEBUG && gpt_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
+if (level == GGML_LOG_LEVEL_DEBUG && common_log_verbosity_thold < LOG_DEFAULT_DEBUG) {
return;
}

@@ -86,19 +86,19 @@ struct gpt_log_entry {
if (timestamp) {
// [M.s.ms.us]
fprintf(fcur, "%s%d.%02d.%03d.%03d%s ",
-g_col[GPT_LOG_COL_BLUE],
+g_col[COMMON_LOG_COL_BLUE],
(int) (timestamp / 1000000 / 60),
(int) (timestamp / 1000000 % 60),
(int) (timestamp / 1000 % 1000),
(int) (timestamp % 1000),
-g_col[GPT_LOG_COL_DEFAULT]);
+g_col[COMMON_LOG_COL_DEFAULT]);
}

switch (level) {
-case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[GPT_LOG_COL_GREEN], g_col[GPT_LOG_COL_DEFAULT]); break;
+case GGML_LOG_LEVEL_INFO: fprintf(fcur, "%sI %s", g_col[COMMON_LOG_COL_GREEN], g_col[COMMON_LOG_COL_DEFAULT]); break;
-case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[GPT_LOG_COL_MAGENTA], "" ); break;
+case GGML_LOG_LEVEL_WARN: fprintf(fcur, "%sW %s", g_col[COMMON_LOG_COL_MAGENTA], "" ); break;
-case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[GPT_LOG_COL_RED], "" ); break;
+case GGML_LOG_LEVEL_ERROR: fprintf(fcur, "%sE %s", g_col[COMMON_LOG_COL_RED], "" ); break;
-case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[GPT_LOG_COL_YELLOW], "" ); break;
+case GGML_LOG_LEVEL_DEBUG: fprintf(fcur, "%sD %s", g_col[COMMON_LOG_COL_YELLOW], "" ); break;
default:
break;
}

@@ -107,18 +107,18 @@ struct gpt_log_entry {
fprintf(fcur, "%s", msg.data());

if (level == GGML_LOG_LEVEL_WARN || level == GGML_LOG_LEVEL_ERROR || level == GGML_LOG_LEVEL_DEBUG) {
-fprintf(fcur, "%s", g_col[GPT_LOG_COL_DEFAULT]);
+fprintf(fcur, "%s", g_col[COMMON_LOG_COL_DEFAULT]);
}

fflush(fcur);
}
};

-struct gpt_log {
+struct common_log {
// default capacity - will be expanded if needed
-gpt_log() : gpt_log(256) {}
+common_log() : common_log(256) {}

-gpt_log(size_t capacity) {
+common_log(size_t capacity) {
file = nullptr;
prefix = false;
timestamps = false;

@@ -137,7 +137,7 @@ struct gpt_log {
resume();
}

-~gpt_log() {
+~common_log() {
pause();
if (file) {
fclose(file);

@@ -158,12 +158,12 @@ private:
int64_t t_start;

// ring buffer of entries
-std::vector<gpt_log_entry> entries;
+std::vector<common_log_entry> entries;
size_t head;
size_t tail;

// worker thread copies into this
-gpt_log_entry cur;
+common_log_entry cur;

public:
void add(enum ggml_log_level level, const char * fmt, va_list args) {

@@ -219,7 +219,7 @@ public:
tail = (tail + 1) % entries.size();
if (tail == head) {
// expand the buffer
-std::vector<gpt_log_entry> new_entries(2*entries.size());
+std::vector<common_log_entry> new_entries(2*entries.size());

size_t new_tail = 0;

@@ -320,15 +320,15 @@ public:
pause();

if (colors) {
-g_col[GPT_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
+g_col[COMMON_LOG_COL_DEFAULT] = LOG_COL_DEFAULT;
-g_col[GPT_LOG_COL_BOLD] = LOG_COL_BOLD;
+g_col[COMMON_LOG_COL_BOLD] = LOG_COL_BOLD;
-g_col[GPT_LOG_COL_RED] = LOG_COL_RED;
+g_col[COMMON_LOG_COL_RED] = LOG_COL_RED;
-g_col[GPT_LOG_COL_GREEN] = LOG_COL_GREEN;
+g_col[COMMON_LOG_COL_GREEN] = LOG_COL_GREEN;
-g_col[GPT_LOG_COL_YELLOW] = LOG_COL_YELLOW;
+g_col[COMMON_LOG_COL_YELLOW] = LOG_COL_YELLOW;
-g_col[GPT_LOG_COL_BLUE] = LOG_COL_BLUE;
+g_col[COMMON_LOG_COL_BLUE] = LOG_COL_BLUE;
-g_col[GPT_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
+g_col[COMMON_LOG_COL_MAGENTA] = LOG_COL_MAGENTA;
-g_col[GPT_LOG_COL_CYAN] = LOG_COL_CYAN;
+g_col[COMMON_LOG_COL_CYAN] = LOG_COL_CYAN;
-g_col[GPT_LOG_COL_WHITE] = LOG_COL_WHITE;
+g_col[COMMON_LOG_COL_WHITE] = LOG_COL_WHITE;
} else {
for (size_t i = 0; i < g_col.size(); i++) {
g_col[i] = "";

@@ -355,47 +355,47 @@ public:
// public API
//

-struct gpt_log * gpt_log_init() {
+struct common_log * common_log_init() {
-return new gpt_log;
+return new common_log;
}

-struct gpt_log * gpt_log_main() {
+struct common_log * common_log_main() {
-static struct gpt_log log;
+static struct common_log log;

return &log;
}

-void gpt_log_pause(struct gpt_log * log) {
+void common_log_pause(struct common_log * log) {
log->pause();
}

-void gpt_log_resume(struct gpt_log * log) {
+void common_log_resume(struct common_log * log) {
log->resume();
}

-void gpt_log_free(struct gpt_log * log) {
+void common_log_free(struct common_log * log) {
delete log;
}

-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...) {
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...) {
va_list args;
va_start(args, fmt);
log->add(level, fmt, args);
va_end(args);
}

-void gpt_log_set_file(struct gpt_log * log, const char * file) {
+void common_log_set_file(struct common_log * log, const char * file) {
log->set_file(file);
}

-void gpt_log_set_colors(struct gpt_log * log, bool colors) {
+void common_log_set_colors(struct common_log * log, bool colors) {
log->set_colors(colors);
}

-void gpt_log_set_prefix(struct gpt_log * log, bool prefix) {
+void common_log_set_prefix(struct common_log * log, bool prefix) {
log->set_prefix(prefix);
}

-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps) {
+void common_log_set_timestamps(struct common_log * log, bool timestamps) {
log->set_timestamps(timestamps);
}

36
common/log.h

@@ -14,23 +14,23 @@
#define LOG_DEFAULT_LLAMA 0

// needed by the LOG_TMPL macro to avoid computing log arguments if the verbosity lower
-// set via gpt_log_set_verbosity()
+// set via common_log_set_verbosity()
-extern int gpt_log_verbosity_thold;
+extern int common_log_verbosity_thold;

-void gpt_log_set_verbosity_thold(int verbosity); // not thread-safe
+void common_log_set_verbosity_thold(int verbosity); // not thread-safe

-// the gpt_log uses an internal worker thread to print/write log messages
+// the common_log uses an internal worker thread to print/write log messages
// when the worker thread is paused, incoming log messages are discarded
-struct gpt_log;
+struct common_log;

-struct gpt_log * gpt_log_init();
+struct common_log * common_log_init();
-struct gpt_log * gpt_log_main(); // singleton, automatically destroys itself on exit
+struct common_log * common_log_main(); // singleton, automatically destroys itself on exit
-void gpt_log_pause (struct gpt_log * log); // pause the worker thread, not thread-safe
+void common_log_pause (struct common_log * log); // pause the worker thread, not thread-safe
-void gpt_log_resume(struct gpt_log * log); // resume the worker thread, not thread-safe
+void common_log_resume(struct common_log * log); // resume the worker thread, not thread-safe
-void gpt_log_free (struct gpt_log * log);
+void common_log_free (struct common_log * log);

LOG_ATTRIBUTE_FORMAT(3, 4)
-void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * fmt, ...);
+void common_log_add(struct common_log * log, enum ggml_log_level level, const char * fmt, ...);

// defaults: file = NULL, colors = false, prefix = false, timestamps = false
//
@@ -54,10 +54,10 @@ void gpt_log_add(struct gpt_log * log, enum ggml_log_level level, const char * f
// D - debug (stderr, V = LOG_DEFAULT_DEBUG)
//

-void gpt_log_set_file (struct gpt_log * log, const char * file); // not thread-safe
+void common_log_set_file (struct common_log * log, const char * file); // not thread-safe
-void gpt_log_set_colors (struct gpt_log * log, bool colors); // not thread-safe
+void common_log_set_colors (struct common_log * log, bool colors); // not thread-safe
-void gpt_log_set_prefix (struct gpt_log * log, bool prefix); // whether to output prefix to each log
+void common_log_set_prefix (struct common_log * log, bool prefix); // whether to output prefix to each log
-void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // whether to output timestamps in the prefix
+void common_log_set_timestamps(struct common_log * log, bool timestamps); // whether to output timestamps in the prefix

// helper macros for logging
// use these to avoid computing log arguments if the verbosity of the log is higher than the threshold

@@ -66,13 +66,13 @@ void gpt_log_set_timestamps(struct gpt_log * log, bool timestamps); // w
//
// LOG_DBG("this is a debug message: %d\n", expensive_function());
//
-// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > gpt_log_verbosity_thold
+// this will avoid calling expensive_function() if LOG_DEFAULT_DEBUG > common_log_verbosity_thold
//

#define LOG_TMPL(level, verbosity, ...) \
    do { \
-        if ((verbosity) <= gpt_log_verbosity_thold) { \
+        if ((verbosity) <= common_log_verbosity_thold) { \
-            gpt_log_add(gpt_log_main(), (level), __VA_ARGS__); \
+            common_log_add(common_log_main(), (level), __VA_ARGS__); \
        } \
    } while (0)

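Startup sketch for the renamed logging API (not part of the diff; the log file name is a placeholder, and the verbosity value simply reuses the LOG_DEFAULT_DEBUG threshold referenced above):

    static void setup_logging() {
        struct common_log * log = common_log_main();   // singleton, destroyed automatically on exit
        common_log_set_colors    (log, true);
        common_log_set_timestamps(log, true);
        common_log_set_prefix    (log, true);
        common_log_set_file      (log, "run.log");               // placeholder file name
        common_log_set_verbosity_thold(LOG_DEFAULT_DEBUG);       // also surface DBG messages on stderr
        common_log_add(log, GGML_LOG_LEVEL_INFO, "logger ready\n");
    }
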
@@ -8,7 +8,7 @@
#include <fstream>
#include <thread>

-void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
+void common_ngram_cache_update(common_ngram_cache & ngram_cache, int ngram_min, int ngram_max,
std::vector<llama_token> & inp, int nnew, bool print_progress) {
const int64_t t_start_ms = ggml_time_ms();
const int64_t inp_size = inp.size();

@@ -20,16 +20,16 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
const int64_t i_start = std::max(inp_size - nnew, ngram_size);
for (int64_t i = i_start; i < inp_size; ++i) {
const int64_t ngram_start = i - ngram_size;
-llama_ngram ngram(&inp[ngram_start], ngram_size);
+common_ngram ngram(&inp[ngram_start], ngram_size);
const llama_token token = inp[i];

-llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
+common_ngram_cache::iterator part_it = ngram_cache.find(ngram);
if (part_it == ngram_cache.end()) {
-llama_ngram_cache_part part;
+common_ngram_cache_part part;
part.emplace(token, 1);
ngram_cache.emplace(ngram, part);
} else {
-llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
+common_ngram_cache_part::iterator token_count_it = part_it->second.find(token);
if (token_count_it == part_it->second.end()) {
part_it->second.emplace(token, 1);
} else {

@@ -62,12 +62,12 @@ constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2};
constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66};

// Helper function that tries to draft a token from only the static ngram cache:
-static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) {
+static llama_token try_draft(common_ngram_cache & nc_static, const common_ngram ngram_static) {
-llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
if (part_static_it == nc_static.end()) {
return -1;
}
-const llama_ngram_cache_part part_static = part_static_it->second;
+const common_ngram_cache_part part_static = part_static_it->second;

int max_count_static = 0;
int sum_count_static = 0;

@@ -95,19 +95,19 @@ static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ng

// Try to draft a token from primary cache (context/dynamic), validate with static cache:
static llama_token try_draft(
-llama_ngram_cache & nc_primary, const std::vector<llama_ngram> & ngrams_primary, llama_ngram_cache_part & part_static,
+common_ngram_cache & nc_primary, const std::vector<common_ngram> & ngrams_primary, common_ngram_cache_part & part_static,
const int * min_sample_size, const int * min_percent) {

llama_token drafted_token = -1;

for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) {
-const llama_ngram ngram_primary = ngrams_primary[i];
+const common_ngram ngram_primary = ngrams_primary[i];

-llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
+common_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary);
if (part_primary_it == nc_primary.end()) {
continue;
}
-const llama_ngram_cache_part part_primary = part_primary_it->second;
+const common_ngram_cache_part part_primary = part_primary_it->second;

int max_count_primary = 0;
int max_count_static = 0;

@@ -117,7 +117,7 @@ static llama_token try_draft(
for (std::pair<llama_token, int> token_count_primary : part_primary) {
const llama_token token = token_count_primary.first;

-llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token);
+common_ngram_cache_part::iterator token_count_static_it = part_static.find(token);

const int32_t count_primary = token_count_primary.second;
const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1;

@@ -142,9 +142,9 @@ static llama_token try_draft(
return drafted_token;
}

-void llama_ngram_cache_draft(
+void common_ngram_cache_draft(
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
-llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static
+common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static
) {
GGML_ASSERT(draft.size() == 1);
const int inp_size = inp.size();

@@ -157,21 +157,21 @@ void llama_ngram_cache_draft(
llama_token drafted_token = -1;

const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-llama_ngram ngram_static;
+common_ngram ngram_static;
for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
}
-llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
+common_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
-llama_ngram_cache_part part_static;
+common_ngram_cache_part part_static;
if (part_static_it != nc_static.end()) {
part_static = part_static_it->second;
}

// cd = context + dynamic
-std::vector<llama_ngram> ngrams_cd;
+std::vector<common_ngram> ngrams_cd;
for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-llama_ngram ngram_cd;
+common_ngram ngram_cd;
for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
}

@@ -196,16 +196,16 @@ void llama_ngram_cache_draft(
}
}

-void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) {
+void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename) {
std::ofstream file_out(filename, std::ios::binary);
-for (std::pair<llama_ngram, llama_ngram_cache_part> item : ngram_cache) {
+for (std::pair<common_ngram, common_ngram_cache_part> item : ngram_cache) {
-const llama_ngram ngram = item.first;
+const common_ngram ngram = item.first;
-llama_ngram_cache_part token_counts = item.second;
+common_ngram_cache_part token_counts = item.second;
GGML_ASSERT(!token_counts.empty());
const int32_t ntokens = token_counts.size();
GGML_ASSERT(ntokens > 0);

-file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(llama_ngram));
+file_out.write(reinterpret_cast<const char *>(&ngram), sizeof(common_ngram));
file_out.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
for (std::pair<llama_token, int32_t> item2 : token_counts) {
const llama_token token = item2.first;

@@ -219,14 +219,14 @@ void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filen

}

-llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
+common_ngram_cache common_ngram_cache_load(std::string & filename) {
std::ifstream hashmap_file(filename, std::ios::binary);
if (!hashmap_file) {
throw std::ifstream::failure("Unable to open file " + filename);
}
-llama_ngram_cache ngram_cache;
+common_ngram_cache ngram_cache;

-llama_ngram ngram;
+common_ngram ngram;
int32_t ntokens;
llama_token token;
int32_t count;

@@ -235,11 +235,11 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
char * ntokensc = reinterpret_cast<char*>(&ntokens);
char * tokenc = reinterpret_cast<char*>(&token);
char * countc = reinterpret_cast<char*>(&count);
-while(hashmap_file.read(ngramc, sizeof(llama_ngram))) {
+while(hashmap_file.read(ngramc, sizeof(common_ngram))) {
GGML_ASSERT(!hashmap_file.eof());
GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
GGML_ASSERT(ntokens > 0);
-llama_ngram_cache_part token_counts;
+common_ngram_cache_part token_counts;

for (int i = 0; i < ntokens; ++i) {
GGML_ASSERT(!hashmap_file.eof());

@@ -257,12 +257,12 @@ llama_ngram_cache llama_ngram_cache_load(std::string & filename) {
return ngram_cache;
}

-void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) {
+void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add) {
-for (std::pair<llama_ngram, llama_ngram_cache_part> ngram_part : ngram_cache_add) {
+for (std::pair<common_ngram, common_ngram_cache_part> ngram_part : ngram_cache_add) {
-const llama_ngram ngram = ngram_part.first;
+const common_ngram ngram = ngram_part.first;
-llama_ngram_cache_part part = ngram_part.second;
+common_ngram_cache_part part = ngram_part.second;

-llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
+common_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram);
if (part_merged_it == ngram_cache_target.end()) {
ngram_cache_target.emplace(ngram, part);
continue;

@@ -273,7 +273,7 @@ void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram
const int32_t count = token_count.second;
GGML_ASSERT(count > 0);

-llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
+common_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token);
if (token_count_merged_it == part_merged_it->second.end()) {
part_merged_it->second.emplace(token, count);
continue;

@ -12,22 +12,22 @@
|
||||||
|
|
||||||
// Data structures to map n-grams to empirical token probabilities:
|
// Data structures to map n-grams to empirical token probabilities:
|
||||||
|
|
||||||
struct llama_ngram {
|
struct common_ngram {
|
||||||
llama_token tokens[LLAMA_NGRAM_MAX];
|
llama_token tokens[LLAMA_NGRAM_MAX];
|
||||||
|
|
||||||
llama_ngram() {
|
common_ngram() {
|
||||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||||
tokens[i] = -1;
|
tokens[i] = -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_ngram(const llama_token * input, const int ngram_size) {
|
common_ngram(const llama_token * input, const int ngram_size) {
|
||||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||||
tokens[i] = i < ngram_size ? input[i] : -1;
|
tokens[i] = i < ngram_size ? input[i] : -1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator==(const llama_ngram & other) const {
|
bool operator==(const common_ngram & other) const {
|
||||||
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
|
||||||
if (tokens[i] != other.tokens[i]) {
|
if (tokens[i] != other.tokens[i]) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -37,28 +37,28 @@ struct llama_ngram {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_token_hash_function {
|
struct common_token_hash_function {
|
||||||
size_t operator()(const llama_token token) const {
|
size_t operator()(const llama_token token) const {
|
||||||
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
// see https://probablydance.com/2018/06/16/fibonacci-hashing-the-optimization-that-the-world-forgot-or-a-better-alternative-to-integer-modulo/
|
||||||
return token * 11400714819323198485llu;
|
return token * 11400714819323198485llu;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llama_ngram_hash_function {
|
struct common_ngram_hash_function {
|
||||||
size_t operator()(const llama_ngram & ngram) const {
|
size_t operator()(const common_ngram & ngram) const {
|
||||||
size_t hash = llama_token_hash_function{}(ngram.tokens[0]);
|
size_t hash = common_token_hash_function{}(ngram.tokens[0]);
|
||||||
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
for (int i = 1; i < LLAMA_NGRAM_MAX; ++i) {
|
||||||
hash ^= llama_token_hash_function{}(ngram.tokens[i]);
|
hash ^= common_token_hash_function{}(ngram.tokens[i]);
|
||||||
}
|
}
|
||||||
return hash;
|
return hash;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// token -> number of times token has been seen
|
// token -> number of times token has been seen
|
||||||
typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
|
typedef std::unordered_map<llama_token, int32_t> common_ngram_cache_part;
|
||||||
|
|
||||||
// n-gram -> empirical distribution of following tokens
|
// n-gram -> empirical distribution of following tokens
|
||||||
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
|
typedef std::unordered_map<common_ngram, common_ngram_cache_part, common_ngram_hash_function> common_ngram_cache;
|
||||||
|
|
||||||
|
|
||||||
// Update an ngram cache with tokens.
|
// Update an ngram cache with tokens.
|
||||||
|
@ -70,8 +70,8 @@ typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash
|
||||||
//
|
//
|
||||||
// In order to get correct results inp_data can ONLY BE APPENDED TO.
|
// In order to get correct results inp_data can ONLY BE APPENDED TO.
|
||||||
// Changes in the middle need a complete rebuild.
|
// Changes in the middle need a complete rebuild.
|
||||||
void llama_ngram_cache_update(
|
void common_ngram_cache_update(
|
||||||
llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
common_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector<llama_token> & inp_data, int nnew, bool print_progress);
|
||||||
|
|
||||||
// Try to draft tokens from ngram caches.
|
// Try to draft tokens from ngram caches.
|
||||||
// inp: the tokens generated so far.
|
// inp: the tokens generated so far.
|
||||||
|
@ -81,21 +81,21 @@ void llama_ngram_cache_update(
|
||||||
// nc_context: ngram cache based on current context.
|
// nc_context: ngram cache based on current context.
|
||||||
// nc_dynamic: ngram cache based on previous user generations.
|
// nc_dynamic: ngram cache based on previous user generations.
|
||||||
// nc_static: ngram cache generated from a large text corpus, used for validation.
|
// nc_static: ngram cache generated from a large text corpus, used for validation.
|
||||||
void llama_ngram_cache_draft(
|
void common_ngram_cache_draft(
|
||||||
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
std::vector<llama_token> & inp, std::vector<llama_token> & draft, int n_draft, int ngram_min, int ngram_max,
|
||||||
llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static);
|
common_ngram_cache & nc_context, common_ngram_cache & nc_dynamic, common_ngram_cache & nc_static);
|
||||||
|
|
||||||
// Save an ngram cache to a file.
|
// Save an ngram cache to a file.
|
||||||
// ngram_cache: the ngram cache to save.
|
// ngram_cache: the ngram cache to save.
|
||||||
// filename: the path under which to save the ngram cache.
|
// filename: the path under which to save the ngram cache.
|
||||||
void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename);
|
void common_ngram_cache_save(common_ngram_cache & ngram_cache, std::string & filename);
|
||||||
|
|
||||||
// Load an ngram cache saved with llama_ngram_cache_save.
|
// Load an ngram cache saved with common_ngram_cache_save.
|
||||||
// filename: the path from which to load the ngram cache.
|
// filename: the path from which to load the ngram cache.
|
||||||
// returns: an ngram cache containing the information saved to filename.
|
// returns: an ngram cache containing the information saved to filename.
|
||||||
llama_ngram_cache llama_ngram_cache_load(std::string & filename);
|
common_ngram_cache common_ngram_cache_load(std::string & filename);
|
||||||
|
|
||||||
// Merge two ngram caches.
|
// Merge two ngram caches.
|
||||||
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
|
// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add.
|
||||||
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
|
// ngram_cache_add: the ngram cache to add to ngram_cache_target.
|
||||||
void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add);
|
void common_ngram_cache_merge(common_ngram_cache & ngram_cache_target, common_ngram_cache & ngram_cache_add);
|
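For reference, here is a minimal sketch of how the renamed `common_ngram_cache` API declared above might be wired together for lookup-style drafting. The variable names, ngram sizes, file path, and include path are assumptions for illustration; this is not code from this diff.

```cpp
#include <string>
#include <vector>

#include "llama.h"
#include "ngram-cache.h" // assumed path of the header whose declarations appear above

// Hypothetical helper: index the current context, draft a few tokens, and persist
// what was learned. Not part of this change; it only illustrates the intended call order.
static void example_ngram_lookup(std::vector<llama_token> & inp, common_ngram_cache & nc_static) {
    common_ngram_cache nc_context; // rebuilt from the current context
    common_ngram_cache nc_dynamic; // accumulated across previous generations

    // index the whole context once; later calls may only append to inp
    common_ngram_cache_update(nc_context, /*ngram_min*/ 1, /*ngram_max*/ 4,
                              inp, (int) inp.size(), /*print_progress*/ false);

    // seed the draft with the last accepted token, then extend it from the caches
    std::vector<llama_token> draft = { inp.back() };
    common_ngram_cache_draft(inp, draft, /*n_draft*/ 8, /*ngram_min*/ 1, /*ngram_max*/ 4,
                             nc_context, nc_dynamic, nc_static);

    // fold the per-context statistics into the dynamic cache and save it for next time
    common_ngram_cache_merge(nc_dynamic, nc_context);
    std::string path = "ngram_cache_dynamic.bin"; // hypothetical file name
    common_ngram_cache_save(nc_dynamic, path);
}
```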
||||||
|
|
|
@ -98,8 +98,8 @@ struct ring_buffer {
|
||||||
std::vector<T> data;
|
std::vector<T> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt_sampler {
|
struct common_sampler {
|
||||||
gpt_sampler_params params;
|
common_sampler_params params;
|
||||||
|
|
||||||
struct llama_sampler * grmr;
|
struct llama_sampler * grmr;
|
||||||
struct llama_sampler * chain;
|
struct llama_sampler * chain;
|
||||||
|
@ -125,7 +125,7 @@ struct gpt_sampler {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
std::string gpt_sampler_params::print() const {
|
std::string common_sampler_params::print() const {
|
||||||
char result[1024];
|
char result[1024];
|
||||||
|
|
||||||
snprintf(result, sizeof(result),
|
snprintf(result, sizeof(result),
|
||||||
|
@ -139,12 +139,12 @@ std::string gpt_sampler_params::print() const {
|
||||||
return std::string(result);
|
return std::string(result);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
|
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
|
||||||
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
|
||||||
|
|
||||||
lparams.no_perf = params.no_perf;
|
lparams.no_perf = params.no_perf;
|
||||||
|
|
||||||
auto * result = new gpt_sampler {
|
auto * result = new common_sampler {
|
||||||
/* .params = */ params,
|
/* .params = */ params,
|
||||||
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
/* .grmr = */ llama_sampler_init_grammar(model, params.grammar.c_str(), "root"),
|
||||||
/* .chain = */ llama_sampler_chain_init(lparams),
|
/* .chain = */ llama_sampler_chain_init(lparams),
|
||||||
|
@ -175,22 +175,22 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
|
||||||
if (params.mirostat == 0) {
|
if (params.mirostat == 0) {
|
||||||
for (const auto & cnstr : params.samplers) {
|
for (const auto & cnstr : params.samplers) {
|
||||||
switch (cnstr) {
|
switch (cnstr) {
|
||||||
case GPT_SAMPLER_TYPE_TOP_K:
|
case COMMON_SAMPLER_TYPE_TOP_K:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k));
|
||||||
break;
|
break;
|
||||||
case GPT_SAMPLER_TYPE_TOP_P:
|
case COMMON_SAMPLER_TYPE_TOP_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_top_p (params.top_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case GPT_SAMPLER_TYPE_MIN_P:
|
case COMMON_SAMPLER_TYPE_MIN_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_min_p (params.min_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case GPT_SAMPLER_TYPE_TFS_Z:
|
case COMMON_SAMPLER_TYPE_TFS_Z:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_tail_free(params.tfs_z, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case GPT_SAMPLER_TYPE_TYPICAL_P:
|
case COMMON_SAMPLER_TYPE_TYPICAL_P:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
|
||||||
break;
|
break;
|
||||||
case GPT_SAMPLER_TYPE_TEMPERATURE:
|
case COMMON_SAMPLER_TYPE_TEMPERATURE:
|
||||||
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
llama_sampler_chain_add(result->chain, llama_sampler_init_temp_ext (params.temp, params.dynatemp_range, params.dynatemp_exponent));
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -224,7 +224,7 @@ struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const st
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_sampler_free(struct gpt_sampler * gsmpl) {
|
void common_sampler_free(struct common_sampler * gsmpl) {
|
||||||
if (gsmpl) {
|
if (gsmpl) {
|
||||||
llama_sampler_free(gsmpl->grmr);
|
llama_sampler_free(gsmpl->grmr);
|
||||||
|
|
||||||
|
@ -234,7 +234,7 @@ void gpt_sampler_free(struct gpt_sampler * gsmpl) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar) {
|
||||||
if (accept_grammar) {
|
if (accept_grammar) {
|
||||||
llama_sampler_accept(gsmpl->grmr, token);
|
llama_sampler_accept(gsmpl->grmr, token);
|
||||||
}
|
}
|
||||||
|
@ -244,14 +244,14 @@ void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool acce
|
||||||
gsmpl->prev.push_back(token);
|
gsmpl->prev.push_back(token);
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_sampler_reset(struct gpt_sampler * gsmpl) {
|
void common_sampler_reset(struct common_sampler * gsmpl) {
|
||||||
llama_sampler_reset(gsmpl->grmr);
|
llama_sampler_reset(gsmpl->grmr);
|
||||||
|
|
||||||
llama_sampler_reset(gsmpl->chain);
|
llama_sampler_reset(gsmpl->chain);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
|
struct common_sampler * common_sampler_clone(common_sampler * gsmpl) {
|
||||||
return new gpt_sampler {
|
return new common_sampler {
|
||||||
/* .params = */ gsmpl->params,
|
/* .params = */ gsmpl->params,
|
||||||
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
/* .grmr = */ llama_sampler_clone(gsmpl->grmr),
|
||||||
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
/* .chain = */ llama_sampler_clone(gsmpl->chain),
|
||||||
|
@ -261,7 +261,7 @@ struct gpt_sampler * gpt_sampler_clone(gpt_sampler * gsmpl) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl) {
|
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl) {
|
||||||
// TODO: measure grammar performance
|
// TODO: measure grammar performance
|
||||||
|
|
||||||
if (gsmpl) {
|
if (gsmpl) {
|
||||||
|
@ -272,7 +272,7 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first) {
|
||||||
gsmpl->set_logits(ctx, idx);
|
gsmpl->set_logits(ctx, idx);
|
||||||
|
|
||||||
auto & grmr = gsmpl->grmr;
|
auto & grmr = gsmpl->grmr;
|
||||||
|
@ -318,21 +318,21 @@ llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context
|
||||||
return cur_p.data[cur_p.selected].id;
|
return cur_p.data[cur_p.selected].id;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl) {
|
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl) {
|
||||||
return llama_sampler_get_seed(gsmpl->chain);
|
return llama_sampler_get_seed(gsmpl->chain);
|
||||||
}
|
}
|
||||||
|
|
||||||
// helpers
|
// helpers
|
||||||
|
|
||||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl) {
|
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl) {
|
||||||
return &gsmpl->cur_p;
|
return &gsmpl->cur_p;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl) {
|
llama_token common_sampler_last(const struct common_sampler * gsmpl) {
|
||||||
return gsmpl->prev.rat(0);
|
return gsmpl->prev.rat(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
std::string common_sampler_print(const struct common_sampler * gsmpl) {
|
||||||
std::string result = "logits ";
|
std::string result = "logits ";
|
||||||
|
|
||||||
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
for (int i = 0; i < llama_sampler_chain_n(gsmpl->chain); i++) {
|
||||||
|
@ -343,7 +343,7 @@ std::string gpt_sampler_print(const struct gpt_sampler * gsmpl) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main, int n) {
|
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx_main, int n) {
|
||||||
n = std::min(n, (int) gsmpl->prev.size());
|
n = std::min(n, (int) gsmpl->prev.size());
|
||||||
|
|
||||||
if (n <= 0) {
|
if (n <= 0) {
|
||||||
|
@ -358,63 +358,63 @@ std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx_main,
|
||||||
|
|
||||||
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
GGML_ASSERT(id != LLAMA_TOKEN_NULL && "null token in the sampling history - should not happen");
|
||||||
|
|
||||||
result += llama_token_to_piece(ctx_main, id);
|
result += common_token_to_piece(ctx_main, id);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr) {
|
char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
|
||||||
switch (cnstr) {
|
switch (cnstr) {
|
||||||
case GPT_SAMPLER_TYPE_TOP_K: return 'k';
|
case COMMON_SAMPLER_TYPE_TOP_K: return 'k';
|
||||||
case GPT_SAMPLER_TYPE_TFS_Z: return 'f';
|
case COMMON_SAMPLER_TYPE_TFS_Z: return 'f';
|
||||||
case GPT_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
case COMMON_SAMPLER_TYPE_TYPICAL_P: return 'y';
|
||||||
case GPT_SAMPLER_TYPE_TOP_P: return 'p';
|
case COMMON_SAMPLER_TYPE_TOP_P: return 'p';
|
||||||
case GPT_SAMPLER_TYPE_MIN_P: return 'm';
|
case COMMON_SAMPLER_TYPE_MIN_P: return 'm';
|
||||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return 't';
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
|
||||||
default : return '?';
|
default : return '?';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr) {
|
std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
|
||||||
switch (cnstr) {
|
switch (cnstr) {
|
||||||
case GPT_SAMPLER_TYPE_TOP_K: return "top_k";
|
case COMMON_SAMPLER_TYPE_TOP_K: return "top_k";
|
||||||
case GPT_SAMPLER_TYPE_TFS_Z: return "tfs_z";
|
case COMMON_SAMPLER_TYPE_TFS_Z: return "tfs_z";
|
||||||
case GPT_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
case COMMON_SAMPLER_TYPE_TYPICAL_P: return "typ_p";
|
||||||
case GPT_SAMPLER_TYPE_TOP_P: return "top_p";
|
case COMMON_SAMPLER_TYPE_TOP_P: return "top_p";
|
||||||
case GPT_SAMPLER_TYPE_MIN_P: return "min_p";
|
case COMMON_SAMPLER_TYPE_MIN_P: return "min_p";
|
||||||
case GPT_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
|
||||||
default : return "";
|
default : return "";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
std::vector<common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names) {
|
||||||
std::unordered_map<std::string, gpt_sampler_type> sampler_canonical_name_map {
|
std::unordered_map<std::string, common_sampler_type> sampler_canonical_name_map {
|
||||||
{ "top_k", GPT_SAMPLER_TYPE_TOP_K },
|
{ "top_k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||||
{ "top_p", GPT_SAMPLER_TYPE_TOP_P },
|
{ "top_p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||||
{ "typ_p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ "typ_p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ "min_p", GPT_SAMPLER_TYPE_MIN_P },
|
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||||
{ "tfs_z", GPT_SAMPLER_TYPE_TFS_Z },
|
{ "tfs_z", COMMON_SAMPLER_TYPE_TFS_Z },
|
||||||
{ "temperature", GPT_SAMPLER_TYPE_TEMPERATURE },
|
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||||
};
|
};
|
||||||
|
|
||||||
// since sampler names are written multiple ways
|
// since sampler names are written multiple ways
|
||||||
// make it ready for both system names and input names
|
// make it ready for both system names and input names
|
||||||
std::unordered_map<std::string, gpt_sampler_type> sampler_alt_name_map {
|
std::unordered_map<std::string, common_sampler_type> sampler_alt_name_map {
|
||||||
{ "top-k", GPT_SAMPLER_TYPE_TOP_K },
|
{ "top-k", COMMON_SAMPLER_TYPE_TOP_K },
|
||||||
{ "top-p", GPT_SAMPLER_TYPE_TOP_P },
|
{ "top-p", COMMON_SAMPLER_TYPE_TOP_P },
|
||||||
{ "nucleus", GPT_SAMPLER_TYPE_TOP_P },
|
{ "nucleus", COMMON_SAMPLER_TYPE_TOP_P },
|
||||||
{ "typical-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ "typical-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ "typical", GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ "typical", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ "typ-p", GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ "typ", GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ "min-p", GPT_SAMPLER_TYPE_MIN_P },
|
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
|
||||||
{ "tfs-z", GPT_SAMPLER_TYPE_TFS_Z },
|
{ "tfs-z", COMMON_SAMPLER_TYPE_TFS_Z },
|
||||||
{ "tfs", GPT_SAMPLER_TYPE_TFS_Z },
|
{ "tfs", COMMON_SAMPLER_TYPE_TFS_Z },
|
||||||
{ "temp", GPT_SAMPLER_TYPE_TEMPERATURE },
|
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> samplers;
|
std::vector<common_sampler_type> samplers;
|
||||||
samplers.reserve(names.size());
|
samplers.reserve(names.size());
|
||||||
|
|
||||||
for (const auto & name : names) {
|
for (const auto & name : names) {
|
||||||
|
@ -434,17 +434,17 @@ std::vector<gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std
|
||||||
return samplers;
|
return samplers;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars) {
|
std::vector<common_sampler_type> common_sampler_types_from_chars(const std::string & chars) {
|
||||||
std::unordered_map<char, gpt_sampler_type> sampler_name_map = {
|
std::unordered_map<char, common_sampler_type> sampler_name_map = {
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_K), GPT_SAMPLER_TYPE_TOP_K },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_K), COMMON_SAMPLER_TYPE_TOP_K },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TFS_Z), GPT_SAMPLER_TYPE_TFS_Z },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TFS_Z), COMMON_SAMPLER_TYPE_TFS_Z },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TYPICAL_P), GPT_SAMPLER_TYPE_TYPICAL_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TYPICAL_P), COMMON_SAMPLER_TYPE_TYPICAL_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TOP_P), GPT_SAMPLER_TYPE_TOP_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TOP_P), COMMON_SAMPLER_TYPE_TOP_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_MIN_P), GPT_SAMPLER_TYPE_MIN_P },
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
|
||||||
{ gpt_sampler_type_to_chr(GPT_SAMPLER_TYPE_TEMPERATURE), GPT_SAMPLER_TYPE_TEMPERATURE }
|
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }
|
||||||
};
|
};
|
||||||
|
|
||||||
std::vector<gpt_sampler_type> samplers;
|
std::vector<common_sampler_type> samplers;
|
||||||
samplers.reserve(chars.size());
|
samplers.reserve(chars.size());
|
||||||
|
|
||||||
for (const auto & c : chars) {
|
for (const auto & c : chars) {
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
// gpt_sampler extends llama_sampler with additional functionality:
|
// common_sampler extends llama_sampler with additional functionality:
|
||||||
//
|
//
|
||||||
// - grammar support
|
// - grammar support
|
||||||
// - custom sampler logic based on the parameters
|
// - custom sampler logic based on the parameters
|
||||||
|
@ -23,30 +23,30 @@
|
||||||
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
// token in order to verify if it fits the grammar. And only if the token doesn't fit the grammar, the
|
||||||
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
// grammar constraints are applied to the full vocabulary and the token is resampled.
|
||||||
//
|
//
|
||||||
// The gpt_sampler also maintains a container with the last accepted tokens. In the future, this can
|
// The common_sampler also maintains a container with the last accepted tokens. In the future, this can
|
||||||
// be moved into the core llama library.
|
// be moved into the core llama library.
|
||||||
//
|
//
|
||||||
// For convenience, the gpt_sampler also maintains a container with the current candidate tokens.
|
// For convenience, the common_sampler also maintains a container with the current candidate tokens.
|
||||||
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
// This can be used to access the probabilities of the rest of the non-sampled tokens.
|
||||||
//
|
//
|
||||||
// TODO: measure grammar performance
|
// TODO: measure grammar performance
|
||||||
//
|
//
|
||||||
|
|
||||||
struct gpt_sampler;
|
struct common_sampler;
|
||||||
|
|
||||||
// llama_sampler API overloads
|
// llama_sampler API overloads
|
||||||
|
|
||||||
struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params);
|
struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
|
||||||
|
|
||||||
void gpt_sampler_free(struct gpt_sampler * gsmpl);
|
void common_sampler_free(struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
|
||||||
void gpt_sampler_accept(struct gpt_sampler * gsmpl, llama_token token, bool accept_grammar);
|
void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
|
||||||
void gpt_sampler_reset (struct gpt_sampler * gsmpl);
|
void common_sampler_reset (struct common_sampler * gsmpl);
|
||||||
struct gpt_sampler * gpt_sampler_clone (struct gpt_sampler * gsmpl);
|
struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// arguments can be nullptr to skip printing
|
// arguments can be nullptr to skip printing
|
||||||
void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler * gsmpl);
|
void common_perf_print(const struct llama_context * ctx, const struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// extended sampling implementation:
|
// extended sampling implementation:
|
||||||
//
|
//
|
||||||
|
@ -58,26 +58,26 @@ void gpt_perf_print(const struct llama_context * ctx, const struct gpt_sampler *
|
||||||
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
// if grammar_first is true, the grammar is applied before the samplers (slower)
|
||||||
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
// useful in cases where all the resulting candidates (not just the sampled one) must fit the grammar
|
||||||
//
|
//
|
||||||
llama_token gpt_sampler_sample(struct gpt_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
llama_token common_sampler_sample(struct common_sampler * gsmpl, struct llama_context * ctx, int idx, bool grammar_first = false);
|
||||||
|
|
||||||
uint32_t gpt_sampler_get_seed(const struct gpt_sampler * gsmpl);
|
uint32_t common_sampler_get_seed(const struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// helpers
|
// helpers
|
||||||
|
|
||||||
// access the internal list of current candidate tokens
|
// access the internal list of current candidate tokens
|
||||||
llama_token_data_array * gpt_sampler_get_candidates(struct gpt_sampler * gsmpl);
|
llama_token_data_array * common_sampler_get_candidates(struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// get the last accepted token
|
// get the last accepted token
|
||||||
llama_token gpt_sampler_last(const struct gpt_sampler * gsmpl);
|
llama_token common_sampler_last(const struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// print the sampler chain into a string
|
// print the sampler chain into a string
|
||||||
std::string gpt_sampler_print(const struct gpt_sampler * gsmpl);
|
std::string common_sampler_print(const struct common_sampler * gsmpl);
|
||||||
|
|
||||||
// get a string representation of the last accepted tokens
|
// get a string representation of the last accepted tokens
|
||||||
std::string gpt_sampler_prev_str(gpt_sampler * gsmpl, llama_context * ctx, int n);
|
std::string common_sampler_prev_str(common_sampler * gsmpl, llama_context * ctx, int n);
|
||||||
|
|
||||||
char gpt_sampler_type_to_chr(enum gpt_sampler_type cnstr);
|
char common_sampler_type_to_chr(enum common_sampler_type cnstr);
|
||||||
std::string gpt_sampler_type_to_str(enum gpt_sampler_type cnstr);
|
std::string common_sampler_type_to_str(enum common_sampler_type cnstr);
|
||||||
|
|
||||||
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
std::vector<enum common_sampler_type> common_sampler_types_from_names(const std::vector<std::string> & names, bool allow_alt_names);
|
||||||
std::vector<enum gpt_sampler_type> gpt_sampler_types_from_chars(const std::string & chars);
|
std::vector<enum common_sampler_type> common_sampler_types_from_chars(const std::string & chars);
|
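As a quick orientation for the renamed API, the sketch below shows the typical lifecycle suggested by the declarations above (init, sample, accept, print, free). It assumes a `llama_model` and `llama_context` created elsewhere, default `common_sampler_params`, and assumed include paths; it is illustrative only and not part of this diff.

```cpp
#include <cstdio>

#include "llama.h"
#include "common.h"   // assumed path; provides common_token_to_piece
#include "sampling.h" // assumed path of the header whose declarations appear above

// Hypothetical driver loop: sample n_predict tokens with a common_sampler.
// Feeding the accepted token back through llama_decode() is elided.
static void example_generate(llama_model * model, llama_context * ctx, int n_predict) {
    common_sampler_params sparams;  // defaults; adjust top_k/top_p/temp/grammar as needed
    common_sampler * smpl = common_sampler_init(model, sparams);

    for (int i = 0; i < n_predict; ++i) {
        // sample from the logits of the last decoded token (idx = -1);
        // grammar_first = false keeps the faster "resample only if needed" path
        const llama_token id = common_sampler_sample(smpl, ctx, /*idx*/ -1, /*grammar_first*/ false);

        // accept into both the sampler chain and the grammar
        common_sampler_accept(smpl, id, /*accept_grammar*/ true);

        printf("%s", common_token_to_piece(ctx, id).c_str());
        // ... llama_decode() with `id` would go here ...
    }

    printf("\n%s\n", common_sampler_print(smpl).c_str());
    common_perf_print(ctx, smpl);
    common_sampler_free(smpl);
}
```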
||||||
|
|
109
docs/android.md
109
docs/android.md
|
@ -2,55 +2,82 @@
|
||||||
# Android
|
# Android
|
||||||
|
|
||||||
## Build on Android using Termux
|
## Build on Android using Termux
|
||||||
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
|
|
||||||
```
|
|
||||||
apt update && apt upgrade -y
|
|
||||||
apt install git make cmake
|
|
||||||
```
|
|
||||||
|
|
||||||
It's recommended to move your model inside the `~/` directory for best performance:
|
[Termux](https://termux.dev/en/) is an Android terminal emulator and Linux environment app (no root required). As of writing, Termux is available experimentally in the Google Play Store; otherwise, it may be obtained directly from the project repo or on F-Droid.
|
||||||
```
|
|
||||||
cd storage/downloads
|
|
||||||
mv model.gguf ~/
|
|
||||||
```
|
|
||||||
|
|
||||||
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
|
With Termux, you can install and run `llama.cpp` as if the environment were Linux. Once in the Termux shell:
|
||||||
|
|
||||||
## Building the Project using Android NDK
|
|
||||||
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
|
|
||||||
|
|
||||||
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
|
|
||||||
```
|
|
||||||
$ mkdir build-android
|
|
||||||
$ cd build-android
|
|
||||||
$ export NDK=<your_ndk_directory>
|
|
||||||
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
|
|
||||||
$ make
|
|
||||||
```
|
|
||||||
|
|
||||||
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
|
|
||||||
|
|
||||||
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
|
|
||||||
|
|
||||||
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
|
|
||||||
```
|
|
||||||
$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
|
|
||||||
$cd /data/data/com.termux/files/home/bin
|
|
||||||
$chmod +x ./*
|
|
||||||
```
|
|
||||||
|
|
||||||
Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
|
|
||||||
|
|
||||||
```
|
```
|
||||||
$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
|
$ apt update && apt upgrade -y
|
||||||
|
$ apt install git cmake
|
||||||
```
|
```
|
||||||
|
|
||||||
Now, you can start chatting:
|
Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
|
||||||
|
|
||||||
|
Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
|
||||||
|
|
||||||
```
|
```
|
||||||
$cd /data/data/com.termux/files/home/bin
|
$ curl -L {model-url} -o ~/{model}.gguf
|
||||||
$./llama-cli -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Here's a demo of an interactive session running on Pixel 5 phone:
|
Then, if you are not already in the repo directory, `cd` into `llama.cpp` and:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./build/bin/llama-simple -m ~/{model}.gguf -c {context-size} -p "{your-prompt}"
|
||||||
|
```
|
||||||
|
|
||||||
|
Here, we show `llama-simple`, but any of the executables under `examples` should work, in theory. Be sure to set `context-size` to a reasonable number (say, 4096) to start with; otherwise, memory could spike and kill your terminal.
|
||||||
|
|
||||||
|
To see what it might look like visually, here's an old demo of an interactive session running on a Pixel 5 phone:
|
||||||
|
|
||||||
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
|
||||||
|
|
||||||
|
## Cross-compile using Android NDK
|
||||||
|
It's possible to build `llama.cpp` for Android on your host system via CMake and the Android NDK. If you are interested in this path, ensure you already have an environment prepared to cross-compile programs for Android (i.e., install the Android SDK). Note that, unlike desktop environments, the Android environment ships with a limited set of native libraries, and so only those libraries are available to CMake when building with the Android NDK (see: https://developer.android.com/ndk/guides/stable_apis).
|
||||||
|
|
||||||
|
Once you're ready and have cloned `llama.cpp`, invoke the following in the project directory:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cmake \
|
||||||
|
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||||
|
-DANDROID_ABI=arm64-v8a \
|
||||||
|
-DANDROID_PLATFORM=android-28 \
|
||||||
|
-DCMAKE_C_FLAGS="-march=armv8.7a" \
|
||||||
|
-DCMAKE_CXX_FLAGS="-march=armv8.7a" \
|
||||||
|
-DGGML_OPENMP=OFF \
|
||||||
|
-DGGML_LLAMAFILE=OFF \
|
||||||
|
-B build-android
|
||||||
|
```
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
- While later versions of the Android NDK ship with OpenMP, CMake would still need to install it as a dependency, which is not supported at this time
|
||||||
|
- `llamafile` does not appear to support Android devices (see: https://github.com/Mozilla-Ocho/llamafile/issues/325)
|
||||||
|
|
||||||
|
The above command should configure `llama.cpp` with the most performant options for modern devices. Even if your device is not running `armv8.7a`, `llama.cpp` includes runtime checks for available CPU features it can use.
|
||||||
|
|
||||||
|
Feel free to adjust the Android ABI for your target. Once the project is configured:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cmake --build build-android --config Release -j{n}
|
||||||
|
$ cmake --install build-android --prefix {install-dir} --config Release
|
||||||
|
```
|
||||||
|
|
||||||
|
After installing, go ahead and download the model of your choice to your host system. Then:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ adb shell "mkdir /data/local/tmp/llama.cpp"
|
||||||
|
$ adb push {install-dir} /data/local/tmp/llama.cpp/
|
||||||
|
$ adb push {model}.gguf /data/local/tmp/llama.cpp/
|
||||||
|
$ adb shell
|
||||||
|
```
|
||||||
|
|
||||||
|
In the `adb shell`:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ cd /data/local/tmp/llama.cpp
|
||||||
|
$ LD_LIBRARY_PATH=lib ./bin/llama-simple -m {model}.gguf -c {context-size} -p "{your-prompt}"
|
||||||
|
```
|
||||||
|
|
||||||
|
That's it!
|
||||||
|
|
||||||
|
Be aware that Android will not find the library path `lib` on its own, so we must specify `LD_LIBRARY_PATH` in order to run the installed executables. Android does support `RPATH` in later API levels, so this could change in the future. Refer to the previous section for information about `context-size` (very important!) and running other `examples`.
|
||||||
|
|
|
@ -26,7 +26,7 @@
|
||||||
|
|
||||||
### Llama.cpp + SYCL
|
### Llama.cpp + SYCL
|
||||||
|
|
||||||
The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*).
|
The llama.cpp SYCL backend is designed to support **Intel GPUs** first. Thanks to the cross-platform nature of SYCL, it also supports other vendor GPUs: Nvidia and AMD.
|
||||||
|
|
||||||
## Recommended Release
|
## Recommended Release
|
||||||
|
|
||||||
|
@ -112,9 +112,17 @@ SYCL backend supports Intel GPU Family:
|
||||||
**Verified devices**
|
**Verified devices**
|
||||||
|
|
||||||
| Nvidia GPU | Status | Verified Model |
|
| Nvidia GPU | Status | Verified Model |
|
||||||
|--------------------------|---------|----------------|
|
|--------------------------|-----------|----------------|
|
||||||
| Ampere Series | Support | A100, A4000 |
|
| Ampere Series | Supported | A100, A4000 |
|
||||||
| Ampere Series *(Mobile)* | Support | RTX 40 Series |
|
| Ampere Series *(Mobile)* | Supported | RTX 40 Series |
|
||||||
|
|
||||||
|
| AMD GPU | Status | Verified Model |
|
||||||
|
|--------------------------|--------------|----------------|
|
||||||
|
| Radeon Pro | Experimental | W6800 |
|
||||||
|
| Radeon RX | Experimental | 6700 XT |
|
||||||
|
|
||||||
|
Note: AMD GPU support is highly experimental and is incompatible with F16.
|
||||||
|
Additionally, it only supports GPUs with a sub_group_size (warp size) of 32.
|
||||||
|
|
||||||
## Docker
|
## Docker
|
||||||
The Docker build option is currently limited to *Intel GPU* targets.
|
The Docker build option is currently limited to *Intel GPU* targets.
|
||||||
|
@ -186,6 +194,10 @@ Platform #0: Intel(R) OpenCL HD Graphics
|
||||||
|
|
||||||
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
|
In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed.
|
||||||
|
|
||||||
|
- **AMD GPU**
|
||||||
|
|
||||||
|
To target AMD GPUs with SYCL, the ROCm stack must be installed first.
|
||||||
|
|
||||||
2. **Install Intel® oneAPI Base toolkit**
|
2. **Install Intel® oneAPI Base toolkit**
|
||||||
|
|
||||||
- **For Intel GPU**
|
- **For Intel GPU**
|
||||||
|
@ -212,6 +224,19 @@ cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENAB
|
||||||
cmake --build buildWithCublas --config Release
|
cmake --build buildWithCublas --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
- **Adding support to AMD GPUs**
|
||||||
|
|
||||||
|
**oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit.
|
||||||
|
|
||||||
|
**oneMKL for rocBLAS**: The current oneMKL releases *(shipped with the oneAPI base toolkit)* don't contain the rocBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs.
|
||||||
|
|
||||||
|
```sh
|
||||||
|
git clone https://github.com/oneapi-src/oneMKL
|
||||||
|
cd oneMKL
|
||||||
|
# Find your HIPTARGET with rocminfo, under the key 'Name:'
|
||||||
|
cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas
|
||||||
|
cmake --build buildWithrocBLAS --config Release
|
||||||
|
```
|
||||||
|
|
||||||
3. **Verify installation and environment**
|
3. **Verify installation and environment**
|
||||||
|
|
||||||
|
@ -223,22 +248,32 @@ sycl-ls
|
||||||
|
|
||||||
- **Intel GPU**
|
- **Intel GPU**
|
||||||
|
|
||||||
When targeting an Intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below:
|
When targeting an Intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`level_zero:gpu`] in the sample output below:
|
||||||
|
|
||||||
```
|
```
|
||||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
[opencl:cpu][opencl:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
[opencl:gpu][opencl:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
[level_zero:gpu][level_zero:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||||
```
|
```
|
||||||
|
|
||||||
- **Nvidia GPU**
|
- **Nvidia GPU**
|
||||||
|
|
||||||
Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow:
|
Similarly, users targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`cuda:gpu`] as below:
|
||||||
|
|
||||||
```
|
```
|
||||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
|
[opencl:acc][opencl:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix]
|
||||||
[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
|
[opencl:cpu][opencl:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix]
|
||||||
[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2]
|
[cuda:gpu][cuda:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.5]
|
||||||
|
```
|
||||||
|
|
||||||
|
- **AMD GPU**
|
||||||
|
|
||||||
|
For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
|
||||||
|
|
||||||
|
```
|
||||||
|
[opencl:cpu][opencl:0] Intel(R) OpenCL, 12th Gen Intel(R) Core(TM) i9-12900K OpenCL 3.0 (Build 0) [2024.18.6.0.02_160000]
|
||||||
|
[hip:gpu][hip:0] AMD HIP BACKEND, AMD Radeon PRO W6800 gfx1030 [HIP 60140.9]
|
||||||
```
|
```
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build llama.cpp
|
||||||
|
@ -266,6 +301,7 @@ cmake --build build --config Release -j -v
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Nvidia GPU
|
#### Nvidia GPU
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# Export relevant ENV variables
|
# Export relevant ENV variables
|
||||||
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH
|
||||||
|
@ -283,7 +319,25 @@ cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -
|
||||||
|
|
||||||
# build all binary
|
# build all binary
|
||||||
cmake --build build --config Release -j -v
|
cmake --build build --config Release -j -v
|
||||||
|
```
|
||||||
|
|
||||||
|
#### AMD GPU
|
||||||
|
|
||||||
|
```sh
|
||||||
|
# Export relevant ENV variables
|
||||||
|
export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH
|
||||||
|
export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH
|
||||||
|
export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR
|
||||||
|
|
||||||
|
# Build LLAMA with rocBLAS acceleration through SYCL
|
||||||
|
|
||||||
|
## AMD
|
||||||
|
# Use FP32, FP16 is not supported
|
||||||
|
# Find your GGML_SYCL_HIP_TARGET with rocminfo, under the key 'Name:'
|
||||||
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=${GGML_SYCL_HIP_TARGET} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
|
# build all binary
|
||||||
|
cmake --build build --config Release -j -v
|
||||||
```
|
```
|
||||||
|
|
||||||
### III. Run the inference
|
### III. Run the inference
|
||||||
|
@ -587,9 +641,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
#### Build
|
#### Build
|
||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|--------------------|-----------------------------------|---------------------------------------------|
|
|--------------------|---------------------------------------|---------------------------------------------|
|
||||||
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized model|
|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized model|
|
||||||
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
||||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||||
|
|
|
@ -198,6 +198,8 @@ The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
### MUSA
|
### MUSA
|
||||||
|
|
||||||
|
This provides GPU acceleration using the MUSA cores of your Moore Threads MTT GPU. Make sure to have the MUSA SDK installed. You can download it from here: [MUSA SDK](https://developer.mthreads.com/sdk/download/musa).
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make GGML_MUSA=1
|
make GGML_MUSA=1
|
||||||
|
@ -209,6 +211,12 @@ The following compilation options are also available to tweak performance:
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The environment variable [`MUSA_VISIBLE_DEVICES`](https://docs.mthreads.com/musa-sdk/musa-sdk-doc-online/programming_guide/Z%E9%99%84%E5%BD%95/) can be used to specify which GPU(s) will be used.
|
||||||
|
|
||||||
|
The environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1` can be used to enable unified memory in Linux. This allows swapping to system RAM instead of crashing when the GPU VRAM is exhausted.
|
||||||
|
|
||||||
|
Most of the compilation options available for CUDA should also be available for MUSA, though they haven't been thoroughly tested yet.
|
||||||
|
|
||||||
### hipBLAS
|
### hipBLAS
|
||||||
|
|
||||||
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
This provides BLAS acceleration on HIP-supported AMD GPUs.
|
||||||
|
|
|
@ -19,8 +19,11 @@ Additionally, there are the following images, similar to the above:
|
||||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||||
|
- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
|
- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
|
- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
|
||||||
|
|
||||||
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library), you'll need to build the images locally for now.
|
The GPU-enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library), you'll need to build the images locally for now.
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
@ -84,3 +87,37 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run
|
||||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Docker With MUSA
|
||||||
|
|
||||||
|
Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/native) properly installed on Linux, `muBLAS` should be accessible inside the container.
|
||||||
|
|
||||||
|
## Building Docker locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
|
||||||
|
docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
|
||||||
|
docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
|
||||||
|
```
|
||||||
|
|
||||||
|
You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
|
||||||
|
|
||||||
|
The defaults are:
|
||||||
|
|
||||||
|
- `MUSA_VERSION` set to `rc3.1.0`
|
||||||
|
|
||||||
|
The resulting images are essentially the same as the non-MUSA images:
|
||||||
|
|
||||||
|
1. `local/llama.cpp:full-musa`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and quantize them to 4-bit precision.
|
||||||
|
2. `local/llama.cpp:light-musa`: This image only includes the main executable file.
|
||||||
|
3. `local/llama.cpp:server-musa`: This image only includes the server executable file.
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
After building locally, usage is similar to the non-MUSA examples, but you'll need to set `mthreads` as the default Docker runtime. This can be done by executing `(cd /usr/bin/musa && sudo ./docker setup $PWD)` and verifying the change with `docker info | grep mthreads` on the host machine. You will also want to use the `--n-gpu-layers` flag.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker run -v /path/to/models:/models local/llama.cpp:full-musa --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
|
docker run -v /path/to/models:/models local/llama.cpp:light-musa -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||||
|
docker run -v /path/to/models:/models local/llama.cpp:server-musa -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||||
|
```
|
||||||
|
|
|
@ -16,7 +16,6 @@ else()
|
||||||
add_subdirectory(baby-llama)
|
add_subdirectory(baby-llama)
|
||||||
add_subdirectory(batched-bench)
|
add_subdirectory(batched-bench)
|
||||||
add_subdirectory(batched)
|
add_subdirectory(batched)
|
||||||
add_subdirectory(benchmark)
|
|
||||||
add_subdirectory(convert-llama2c-to-ggml)
|
add_subdirectory(convert-llama2c-to-ggml)
|
||||||
add_subdirectory(embedding)
|
add_subdirectory(embedding)
|
||||||
add_subdirectory(eval-callback)
|
add_subdirectory(eval-callback)
|
||||||
|
|
|
@ -15,13 +15,13 @@ static void print_usage(int, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
common_params params;
|
||||||
|
|
||||||
if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_BENCH, print_usage)) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
gpt_init();
|
common_init();
|
||||||
|
|
||||||
int is_pp_shared = params.is_pp_shared;
|
int is_pp_shared = params.is_pp_shared;
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// initialize the model
|
// initialize the model
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_params_from_gpt_params(params);
|
llama_model_params model_params = common_model_params_to_llama(params);
|
||||||
|
|
||||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
|
||||||
|
|
||||||
|
@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
|
llama_context_params ctx_params = common_context_params_to_llama(params);
|
||||||
|
|
||||||
// ensure enough sequences are available
|
// ensure enough sequences are available
|
||||||
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
|
ctx_params.n_seq_max = n_pl.empty() ? 1 : *std::max_element(n_pl.begin(), n_pl.end());
|
||||||
|
@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
|
||||||
// warm up
|
// warm up
|
||||||
{
|
{
|
||||||
for (int i = 0; i < 16; ++i) {
|
for (int i = 0; i < 16; ++i) {
|
||||||
llama_batch_add(batch, 0, i, { 0 }, false);
|
common_batch_add(batch, 0, i, { 0 }, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
|
||||||
|
@ -122,11 +122,11 @@ int main(int argc, char ** argv) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_clear(batch);
|
common_batch_clear(batch);
|
||||||
|
|
||||||
for (int i = 0; i < pp; ++i) {
|
for (int i = 0; i < pp; ++i) {
|
||||||
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
|
for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) {
|
||||||
llama_batch_add(batch, 0, i, { j }, false);
|
common_batch_add(batch, 0, i, { j }, false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
batch.logits[batch.n_tokens - 1] = true;
|
batch.logits[batch.n_tokens - 1] = true;
|
||||||
|
@@ -151,10 +151,10 @@ int main(int argc, char ** argv) {
const auto t_tg_start = ggml_time_us();

for (int i = 0; i < tg; ++i) {
-llama_batch_clear(batch);
+common_batch_clear(batch);

for (int j = 0; j < pl; ++j) {
-llama_batch_add(batch, 0, pp + i, { j }, true);
+common_batch_add(batch, 0, pp + i, { j }, true);
}

if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
@@ -15,16 +15,16 @@ static void print_usage(int, char ** argv) {
}

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;

params.prompt = "Hello my name is";
params.n_predict = 32;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
return 1;
}

-gpt_init();
+common_init();

// number of parallel batches
int n_parallel = params.n_parallel;
@@ -39,7 +39,7 @@ int main(int argc, char ** argv) {

// initialize the model

-llama_model_params model_params = llama_model_params_from_gpt_params(params);
+llama_model_params model_params = common_model_params_to_llama(params);

llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);

@@ -51,13 +51,13 @@ int main(int argc, char ** argv) {
// tokenize the prompt

std::vector<llama_token> tokens_list;
-tokens_list = ::llama_tokenize(model, params.prompt, true);
+tokens_list = common_tokenize(model, params.prompt, true);

const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size())*n_parallel;

// initialize the context

-llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+llama_context_params ctx_params = common_context_params_to_llama(params);

ctx_params.n_ctx = n_kv_req;
ctx_params.n_batch = std::max(n_predict, n_parallel);
@@ -94,7 +94,7 @@ int main(int argc, char ** argv) {
LOG("\n");

for (auto id : tokens_list) {
-LOG("%s", llama_token_to_piece(ctx, id).c_str());
+LOG("%s", common_token_to_piece(ctx, id).c_str());
}

// create a llama_batch
@@ -108,7 +108,7 @@ int main(int argc, char ** argv) {

// evaluate the initial prompt
for (size_t i = 0; i < tokens_list.size(); ++i) {
-llama_batch_add(batch, tokens_list[i], i, seq_ids, false);
+common_batch_add(batch, tokens_list[i], i, seq_ids, false);
}
GGML_ASSERT(batch.n_tokens == (int) tokens_list.size());

@@ -123,8 +123,8 @@ int main(int argc, char ** argv) {
decoder_start_token_id = llama_token_bos(model);
}

-llama_batch_clear(batch);
+common_batch_clear(batch);
-llama_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
+common_batch_add(batch, decoder_start_token_id, 0, seq_ids, false);
}

// llama_decode will output logits only for the last token of the prompt
@@ -161,7 +161,7 @@ int main(int argc, char ** argv) {

while (n_cur <= n_predict) {
// prepare the next batch
-llama_batch_clear(batch);
+common_batch_clear(batch);

// sample the next token for each parallel sequence / stream
for (int32_t i = 0; i < n_parallel; ++i) {
@@ -185,15 +185,15 @@ int main(int argc, char ** argv) {

// if there is only one stream, we print immediately to stdout
if (n_parallel == 1) {
-LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
}

-streams[i] += llama_token_to_piece(ctx, new_token_id);
+streams[i] += common_token_to_piece(ctx, new_token_id);

i_batch[i] = batch.n_tokens;

// push this new token for next evaluation
-llama_batch_add(batch, new_token_id, n_cur, { i }, true);
+common_batch_add(batch, new_token_id, n_cur, { i }, true);

n_decode += 1;
}

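Note: the hunks above rename the batch helpers from the llama_ prefix to the common_ prefix (they live in common/, not in the core library). A minimal usage sketch under that assumption follows; it presumes an existing llama_context * ctx and an already tokenized prompt, and is illustrative only rather than part of this diff.

// Illustrative sketch (assumes common.h after this rename, a valid llama_context * ctx,
// and std::vector<llama_token> prompt_tokens obtained via common_tokenize()):
llama_batch batch = llama_batch_init(512, 0, 1);

common_batch_clear(batch);                              // formerly llama_batch_clear()
for (size_t i = 0; i < prompt_tokens.size(); ++i) {
    // formerly llama_batch_add(); request logits only for the last prompt token
    common_batch_add(batch, prompt_tokens[i], i, { 0 }, i == prompt_tokens.size() - 1);
}

if (llama_decode(ctx, batch) != 0) {
    fprintf(stderr, "llama_decode() failed\n");
}

llama_batch_free(batch);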
@@ -1,6 +0,0 @@
-set(TARGET llama-bench-matmult)
-add_executable(${TARGET} benchmark-matmult.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,275 +0,0 @@
-#include "common.h"
-#include "ggml.h"
-
-#include <locale.h>
-#include <assert.h>
-#include <math.h>
-#include <cstring>
-#include <cstdio>
-#include <cinttypes>
-#include <unordered_map>
-#include <queue>
-#include <string.h>
-#include <cassert>
-#include <fstream>
-#include <string>
-#include <iterator>
-#include <algorithm>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-if (plan.work_size > 0) {
-buf.resize(plan.work_size);
-plan.work_data = buf.data();
-}
-
-ggml_graph_compute(graph, &plan);
-}
-
-static float tensor_sum_elements(const ggml_tensor * tensor) {
-double sum = 0;
-if (tensor->type == GGML_TYPE_F32) {
-for (int j = 0; j < tensor->ne[1]; j++) {
-for (int k = 0; k < tensor->ne[0]; k++) {
-sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
-}
-}
-}
-return sum;
-}
-
-static void tensor_dump(const ggml_tensor * tensor, const char * name) {
-printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
-tensor->type, ggml_type_name(tensor->type),
-tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-float sum = tensor_sum_elements(tensor);
-printf("Sum of tensor %s is %6.2f\n", name, sum);
-}
-
-#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
-
-struct benchmark_params_struct {
-int n_threads = 1;
-int32_t n_iterations = 10;
-};
-
-static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
-fprintf(stderr, "usage: %s [options]\n", argv[0]);
-fprintf(stderr, "\n");
-fprintf(stderr, "options:\n");
-fprintf(stderr, " -h, --help show this help message and exit\n");
-fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
-fprintf(stderr, " -i N, --iter N number of iterations to use during computation (default: %d)\n", params.n_iterations);
-fprintf(stderr, "\n");
-}
-
-int main(int argc, char ** argv) {
-struct benchmark_params_struct benchmark_params;
-
-bool invalid_param = false;
-std::string arg;
-for (int i = 1; i < argc; i++) {
-arg = argv[i];
-
-if (arg == "-t" || arg == "--threads") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-benchmark_params.n_threads = std::stoi(argv[i]);
-} else if (arg == "-i" || arg == "--iter") {
-if (++i >= argc) {
-invalid_param = true;
-break;
-}
-benchmark_params.n_iterations = std::stoi(argv[i]);
-} else if (arg == "-h" || arg == "--help") {
-print_usage(argc, argv, benchmark_params);
-exit(0);
-}
-}
-if (invalid_param) {
-fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-print_usage(argc, argv, benchmark_params);
-exit(1);
-}
-
-print_build_info();
-printf("Starting Test\n");
-
-// create the ggml context
-struct ggml_context * ctx;
-//const int sizex = 4096;
-//const int sizey = 11008;
-
-#undef VERBOSE_DEBUGGING
-#ifndef VERBOSE_DEBUGGING
-const int sizey = 4096;
-const int sizex = 11008;
-const int sizez = 128;
-#else
-/* Working - let's increase size */
-const int sizey = 1;
-const int sizex = (8*32);
-const int sizez = 1;
-
-/*const int sizey = 1;
-const int sizex = 3*(8*32);
-const int sizez = 1;*/
-#endif
-
-//printf("Memsize required = %i\n", sizex*sizex);
-
-// TODO: perform the bench for all types or for a user specified type
-const ggml_type qtype = GGML_TYPE_Q4_1;
-
-size_t ctx_size = 0;
-ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-ctx_size += ggml_row_size(qtype, sizex*sizey);
-ctx_size += ggml_row_size(qtype, sizex*sizey);
-ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-ctx_size += 1024*1024*16;
-
-printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
-
-struct ggml_init_params params = {
-/*.mem_size =*/ ctx_size,
-/*.mem_buffer =*/ NULL,
-/* no_alloc =*/ 0
-};
-
-ctx = ggml_init(params);
-if (!ctx) {
-fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-return 1;
-}
-
-
-printf("Creating new tensors\n");
-// printf("Creating new tensor m1\n");
-struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-ggml_set_f32(m11, 1.0f);
-
-// printf("Creating new tensor m1\n");
-struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-ggml_set_f32(m12, 1.5f);
-
-// printf("Creating new tensor m2\n");
-struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
-ggml_set_f32(m2, 2.0f);
-
-printf("\n------ Test 1 - Matrix Mult via F32 code\n");
-// printf("Creating new tensor m11xm2\n");
-struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
-
-// printf("Creating compute graph\n");
-struct ggml_cgraph * gf = ggml_new_graph(ctx);
-ggml_build_forward_expand(gf, m11xm2);
-
-printf("n_threads=%i\n", benchmark_params.n_threads);
-
-TENSOR_DUMP(m11);
-TENSOR_DUMP(m2);
-
-std::vector<uint8_t> work_buffer;
-
-ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-
-TENSOR_DUMP(ggml_graph_node(gf, 0));
-
-printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
-
-int32_t nelements = sizex*sizey;
-
-// Set up a the benchmark matrices
-// printf("Creating new tensor q11 & Running quantize\n");
-struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
-
-// Set up a the compute graph
-// printf("Creating new tensor q31\n");
-struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
-
-// printf("Creating compute graph\n");
-struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
-ggml_build_forward_expand(gf31, q31);
-
-// Set up a second graph computation to make sure we override the CPU cache lines
-// printf("Creating new tensor q12 & Running quantize\n");
-struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
-
-// printf("Creating new tensor q32\n");
-struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
-
-//printf("Creating compute graph\n");
-struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
-ggml_build_forward_expand(gf32, q32);
-printf("n_threads=%i\n", benchmark_params.n_threads);
-
-const int dimx = sizex;
-const int dimy = sizey;
-const int dimz = sizez;
-long long int flops_per_dot_product = dimy + dimy;
-long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
-
-
-// Let's use the F32 result from above as a reference for the quantized multiplication
-float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
-
-printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
-printf("=====================================================================================\n");
-
-double gflops_sum = 0;
-for (int i=0;i<benchmark_params.n_iterations ;i++) {
-
-long long int start = ggml_time_us();
-//printf("Running ggml_graph_compute\n");
-ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
-
-long long int stop = ggml_time_us();
-long long int usec = stop-start;
-double gflops = (double)(flops_per_matrix)/usec/1000.0;
-gflops_sum += gflops;
-printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
-i,
-benchmark_params.n_threads,
-sizex, sizey, sizez, flops_per_matrix,
-usec,gflops);
-
-#ifdef VERBOSE_DEBUGGING
-TENSOR_DUMP("res",gf31.nodes[0])
-#endif
-
-// Check that the matrix multiplication result is in the right ballpark
-// We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
-float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
-float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
-
-if (delta > allowed_delta) {
-printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
-sum_of_F32_reference,
-sum_of_Q4_result,
-delta,
-allowed_delta
-);
-exit(0);
-}
-
-// Running a different graph computation to make sure we override the CPU cache lines
-ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
-}
-printf("\n");
-printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
-printf("=====================================================================================\n");
-}
@@ -872,7 +872,7 @@ static std::string basename(const std::string &path) {
}

int main(int argc, char ** argv) {
-gpt_init();
+common_init();

struct train_params params = get_default_train_params();
if (!params_parse(argc, argv, &params)) {
@@ -31,7 +31,7 @@ template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
-ret += llama_token_to_piece(ctx, *begin);
+ret += common_token_to_piece(ctx, *begin);
}

return ret;
@@ -272,8 +272,8 @@ struct tokenized_prompt {

tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
-tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
+tokens_pos = common_tokenize(ctx, pos, add_bos, true);
-tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
+tokens_neg = common_tokenize(ctx, neg, add_bos, true);
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
padding_seq(ctx, tokens_pos, max_seq_len);
padding_seq(ctx, tokens_neg, max_seq_len);
@@ -281,7 +281,7 @@ struct tokenized_prompt {

void padding_seq(llama_context * ctx, std::vector<llama_token> & tokens, size_t len) {
// TODO: customize padding token
-std::vector<llama_token> pad_tokens = ::llama_tokenize(ctx, " ", false);
+std::vector<llama_token> pad_tokens = common_tokenize(ctx, " ", false);
llama_token pad_tok = pad_tokens.back();
while (tokens.size() < len) {
tokens.push_back(pad_tok);
@@ -370,7 +370,7 @@ static void export_gguf(const std::vector<struct ggml_tensor *> & v_ctrl, const
* Load prompt files and completion file.
* Then format each pair of prompt + completion to make an entry.
*/
-static int prepare_entries(gpt_params & params, train_context & ctx_train) {
+static int prepare_entries(common_params & params, train_context & ctx_train) {
// load prompts
std::vector<std::string> positive_prompts = ctrlvec_load_prompt_file(params.cvector_positive_file, true);
std::vector<std::string> negative_prompts = ctrlvec_load_prompt_file(params.cvector_negative_file, true);
@@ -388,9 +388,9 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
}

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_CVECTOR_GENERATOR, print_usage)) {
return 1;
}

@@ -413,7 +413,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);

// load the model to get hparams
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
@@ -28,7 +28,7 @@ static std::vector<std::string> split_lines(const std::string & s, const std::st
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
size_t n_tokens = tokens.size();
for (size_t i = 0; i < n_tokens; i++) {
-llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+common_batch_add(batch, tokens[i], i, { seq_id }, true);
}
}

@@ -74,18 +74,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
}

float * out = output + embd_pos * n_embd;
-llama_embd_normalize(embd, out, n_embd, embd_norm);
+common_embd_normalize(embd, out, n_embd, embd_norm);
}
}

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EMBEDDING)) {
return 1;
}

-gpt_init();
+common_init();

params.embedding = true;
// For non-causal models, batch size must be equal to ubatch size
@@ -95,7 +95,7 @@ int main(int argc, char ** argv) {
llama_numa_init(params.numa);

// load the model
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
@@ -122,7 +122,7 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_INF("\n");
-LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}

// split the prompt into lines
@@ -135,7 +135,7 @@ int main(int argc, char ** argv) {
// tokenize the prompts and trim
std::vector<std::vector<int32_t>> inputs;
for (const auto & prompt : prompts) {
-auto inp = ::llama_tokenize(ctx, prompt, true, true);
+auto inp = common_tokenize(ctx, prompt, true, true);
if (inp.size() > n_batch) {
LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
__func__, (long long int) inp.size(), (long long int) n_batch);
@@ -159,7 +159,7 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt %d: '%s'\n", __func__, i, prompts[i].c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, inputs[i].size());
for (int j = 0; j < (int) inputs[i].size(); j++) {
-LOG("%6d -> '%s'\n", inputs[i][j], llama_token_to_piece(ctx, inputs[i][j]).c_str());
+LOG("%6d -> '%s'\n", inputs[i][j], common_token_to_piece(ctx, inputs[i][j]).c_str());
}
LOG("\n\n");
}
@@ -199,7 +199,7 @@ int main(int argc, char ** argv) {
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
e += pooling_type == LLAMA_POOLING_TYPE_NONE ? batch.n_tokens : s;
s = 0;
-llama_batch_clear(batch);
+common_batch_clear(batch);
}

// add to batch
@@ -263,7 +263,7 @@ int main(int argc, char ** argv) {
LOG("\n");
for (int i = 0; i < n_prompts; i++) {
for (int j = 0; j < n_prompts; j++) {
-float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f ", sim);
}
LOG("%1.10s", prompts[i].c_str());
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
for (int i = 0;;) { // at least two iteration (n_embd_count > 1)
LOG(" [");
for (int j = 0;;) { // at least two iteration (n_embd_count > 1)
-float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
+float sim = common_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
LOG("%6.2f", sim);
j++;
if (j < n_embd_count) LOG(", "); else break;
@@ -126,10 +126,10 @@ static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
return true;
}

-static bool run(llama_context * ctx, const gpt_params & params) {
+static bool run(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));

-std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, add_bos);
+std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, add_bos);

if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) {
LOG_ERR("%s : failed to eval\n", __func__);
@@ -142,13 +142,13 @@ static bool run(llama_context * ctx, const gpt_params & params) {
int main(int argc, char ** argv) {
callback_data cb_data;

-gpt_params params;
+common_params params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}

-gpt_init();
+common_init();

llama_backend_init();
llama_numa_init(params.numa);
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
params.warmup = false;

// init
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_INF("\n");
-LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
LOG_INF("\n");
}

@@ -128,7 +128,7 @@ struct lora_merge_ctx {

lora_merge_ctx(
std::string & base_fname,
-std::vector<llama_lora_adapter_info> & lora_files,
+std::vector<common_lora_adapter_info> & lora_files,
std::string & outfile,
int n_threads) : base_model(base_fname, 0), n_threads(n_threads), fout(outfile, std::ios::binary) {
fout.exceptions(std::ofstream::failbit); // fail fast on write errors
@@ -314,9 +314,9 @@ struct lora_merge_ctx {
// optionally dequantize it
printf("%s : + dequantize base tensor from %s to F32\n", __func__, ggml_type_name(base->type));
auto nels = ggml_nelements(inp_base);
-ggml_type_traits_t qtype = ggml_internal_get_type_traits(base->type);
+const auto * qtype = ggml_get_type_traits(base->type);
std::vector<uint8_t> dequant_buf(nels * sizeof(float));
-qtype.to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
+qtype->to_float(read_buf.data(), (float *)dequant_buf.data(), nels);
ggml_backend_tensor_set(inp_base, dequant_buf.data(), 0, dequant_buf.size());
} else {
ggml_backend_tensor_set(inp_base, read_buf.data(), 0, ggml_nbytes(inp_base));
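Note: the hunk above switches from ggml_internal_get_type_traits(), which returned a traits struct by value, to ggml_get_type_traits(), which returns a pointer, so the to_float call now goes through ->. A minimal sketch of the new call pattern, mirroring the dequantize path above; it assumes a quantized tensor base whose raw bytes sit in read_buf, and is illustrative only.

// Illustrative sketch (assumes the ggml API after this change):
const auto * traits = ggml_get_type_traits(base->type);   // pointer, not a by-value struct
std::vector<float> f32_buf(ggml_nelements(base));
traits->to_float(read_buf.data(), f32_buf.data(), ggml_nelements(base));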
@@ -400,9 +400,9 @@ static void print_usage(int, char ** argv) {
}

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_EXPORT_LORA, print_usage)) {
return 1;
}

@@ -11,7 +11,7 @@ static void write_table_header(std::ofstream & file) {
file << "| -------- | ----------- |\n";
}

-static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
+static void write_table_entry(std::ofstream & file, const common_arg & opt) {
file << "| `";
// args
for (const auto & arg : opt.args) {
@@ -40,7 +40,7 @@ static void write_table_entry(std::ofstream & file, const llama_arg & opt) {
file << "` | " << md_help << " |\n";
}

-static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
+static void write_table(std::ofstream & file, std::vector<common_arg *> & opts) {
write_table_header(file);
for (const auto & opt : opts) {
write_table_entry(file, *opt);
@@ -50,12 +50,12 @@ static void write_table(std::ofstream & file, std::vector<llama_arg *> & opts) {
static void export_md(std::string fname, llama_example ex) {
std::ofstream file(fname, std::ofstream::out | std::ofstream::trunc);

-gpt_params params;
+common_params params;
-auto ctx_arg = gpt_params_parser_init(params, ex);
+auto ctx_arg = common_params_parser_init(params, ex);

-std::vector<llama_arg *> common_options;
+std::vector<common_arg *> common_options;
-std::vector<llama_arg *> sparam_options;
+std::vector<common_arg *> sparam_options;
-std::vector<llama_arg *> specific_options;
+std::vector<common_arg *> specific_options;
for (auto & opt : ctx_arg.options) {
// in case multiple LLAMA_EXAMPLE_* are set, we prioritize the LLAMA_EXAMPLE_* matching current example
if (opt.is_sparam) {
@@ -22,12 +22,20 @@
#endif

enum split_operation : uint8_t {
-SPLIT_OP_SPLIT,
+OP_NONE,
-SPLIT_OP_MERGE,
+OP_SPLIT,
+OP_MERGE,
+};
+
+enum split_mode : uint8_t {
+MODE_NONE,
+MODE_TENSOR,
+MODE_SIZE,
};

struct split_params {
-split_operation operation = SPLIT_OP_SPLIT;
+split_operation operation = OP_NONE;
+split_mode mode = MODE_NONE;
size_t n_bytes_split = 0;
int n_split_tensors = 128;
std::string input;
@@ -87,59 +95,52 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
}

bool arg_found = false;
-bool is_op_set = false;
-bool is_mode_set = false;
if (arg == "-h" || arg == "--help") {
split_print_usage(argv[0]);
exit(0);
-}
+} else if (arg == "--version") {
-if (arg == "--version") {
fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT);
fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET);
exit(0);
-}
+} else if (arg == "--dry-run") {
-if (arg == "--dry-run") {
arg_found = true;
params.dry_run = true;
-}
+} else if (arg == "--no-tensor-first-split") {
-if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
-}
+} else if (arg == "--merge") {
+arg_found = true;
-if (is_op_set) {
+if (params.operation != OP_NONE && params.operation != OP_MERGE) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
}
-if (arg == "--merge") {
+params.operation = OP_MERGE;
+} else if (arg == "--split") {
arg_found = true;
-is_op_set = true;
+if (params.operation != OP_NONE && params.operation != OP_SPLIT) {
-params.operation = SPLIT_OP_MERGE;
+throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
+}
+params.operation = OP_SPLIT;
+} else if (arg == "--split-max-tensors") {
+if (++arg_idx >= argc) {
+invalid_param = true;
+break;
}
-if (arg == "--split") {
arg_found = true;
-is_op_set = true;
+if (params.mode != MODE_NONE && params.mode != MODE_TENSOR) {
-params.operation = SPLIT_OP_SPLIT;
-}
-
-if (is_mode_set) {
throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
}
-if (arg == "--split-max-tensors") {
+params.mode = MODE_TENSOR;
-if (++arg_idx >= argc) {
-invalid_param = true;
-break;
-}
-arg_found = true;
-is_mode_set = true;
params.n_split_tensors = atoi(argv[arg_idx]);
-}
+} else if (arg == "--split-max-size") {
-if (arg == "--split-max-size") {
if (++arg_idx >= argc) {
invalid_param = true;
break;
}
arg_found = true;
-is_mode_set = true;
+if (params.mode != MODE_NONE && params.mode != MODE_SIZE) {
+throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both");
+}
+params.mode = MODE_SIZE;
params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]);
}

@@ -148,6 +149,15 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
}
}

+// the operation is split if not specified
+if (params.operation == OP_NONE) {
+params.operation = OP_SPLIT;
+}
+// the split mode is by tensor if not specified
+if (params.mode == MODE_NONE) {
+params.mode = MODE_TENSOR;
+}

if (invalid_param) {
throw std::invalid_argument("error: invalid parameter for argument: " + arg);
}
@@ -265,13 +275,15 @@ struct split_strategy {
}

bool should_split(int i_tensor, size_t next_size) {
-if (params.n_bytes_split > 0) {
+if (params.mode == MODE_SIZE) {
// split by max size per file
return next_size > params.n_bytes_split;
-} else {
+} else if (params.mode == MODE_TENSOR) {
// split by number of tensors per file
return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0;
}
+// should never happen
+GGML_ABORT("invalid mode");
}

void print_info() {
@@ -559,9 +571,9 @@ int main(int argc, const char ** argv) {
split_params_parse(argc, argv, params);

switch (params.operation) {
-case SPLIT_OP_SPLIT: gguf_split(params);
+case OP_SPLIT: gguf_split(params);
break;
-case SPLIT_OP_MERGE: gguf_merge(params);
+case OP_MERGE: gguf_merge(params);
break;
default: split_print_usage(argv[0]);
exit(EXIT_FAILURE);
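Note: the gguf-split hunks above replace the is_op_set/is_mode_set booleans with explicit OP_NONE/MODE_NONE sentinel values and fill in defaults after parsing. A condensed sketch of the resulting behaviour, using only names taken from the diff; it is illustrative, not part of the change itself.

// Illustrative sketch of the post-refactor defaults and dispatch:
split_params params;                      // starts as OP_NONE / MODE_NONE
// ... argument parsing may set params.operation and params.mode ...
if (params.operation == OP_NONE) {
    params.operation = OP_SPLIT;          // default operation: split
}
if (params.mode == MODE_NONE) {
    params.mode = MODE_TENSOR;            // default mode: split by tensor count
}
// should_split() now dispatches on params.mode:
//   MODE_SIZE   -> next_size > params.n_bytes_split
//   MODE_TENSOR -> i_tensor % params.n_split_tensors == 0
//   otherwise   -> GGML_ABORT("invalid mode")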
@@ -15,11 +15,11 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
llama_batch batch = llama_batch_init(llama_n_batch(ctx), 0, 1);

for (uint64_t i = 0; i < sentences.size(); i++) {
-llama_batch_clear(batch);
+common_batch_clear(batch);

const std::string input_string = instruction + sentences[i];

-std::vector<llama_token> inputs = llama_tokenize(model, input_string, true, false);
+std::vector<llama_token> inputs = common_tokenize(model, input_string, true, false);

const int32_t n_toks = inputs.size();

@@ -28,7 +28,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
// inputs.push_back(llama_token_eos(model));

// we want to ignore instruction tokens for mean pooling
-const int32_t n_inst = llama_tokenize(model, instruction, true, false).size();
+const int32_t n_inst = common_tokenize(model, instruction, true, false).size();

#ifdef GRIT_DEBUG
// debug tokens - should be matching as referenced in the GritLM sample
@@ -40,7 +40,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve

// add input to batch (this increments n_tokens)
for (int32_t j = 0; j < n_toks; j++) {
-llama_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
+common_batch_add(batch, inputs[j], j, { 0 }, j >= n_inst);
}

// clear previous kv_cache values (irrelevant for embeddings)
@@ -75,7 +75,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
}

std::vector<float> emb_norm(emb_unorm.size());
-llama_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
+common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd);
result.push_back(emb_norm);

#ifdef GRIT_DEBUG
@@ -105,16 +105,16 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std

llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);

-std::vector<llama_token> inputs = llama_tokenize(model, prompt, false, true);
+std::vector<llama_token> inputs = common_tokenize(model, prompt, false, true);
int32_t i_current_token = 0;

while (true) {
-llama_batch_clear(bat);
+common_batch_clear(bat);
{
const int32_t n_inputs = inputs.size();

for (int32_t i = 0; i < n_inputs; i++) {
-llama_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
+common_batch_add(bat, inputs[i], i_current_token++, { 0 }, i == n_inputs - 1);
}
}
inputs.clear();
@@ -127,7 +127,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
break;
}

-std::string piece = llama_token_to_piece(ctx, token);
+std::string piece = common_token_to_piece(ctx, token);
if (stream) {
std::printf("%s", piece.c_str());
std::fflush(stdout);
@@ -152,16 +152,16 @@ static std::string gritlm_instruction(const std::string & instruction) {
}

int main(int argc, char * argv[]) {
-gpt_params params;
+common_params params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
return 1;
}

-gpt_init();
+common_init();

-llama_model_params mparams = llama_model_params_from_gpt_params(params);
+llama_model_params mparams = common_model_params_to_llama(params);
-llama_context_params cparams = llama_context_params_from_gpt_params(params);
+llama_context_params cparams = common_context_params_to_llama(params);

llama_backend_init();

@@ -199,10 +199,10 @@ int main(int argc, char * argv[]) {

const int n_embd = llama_n_embd(model);

-const float cosine_sim_q0_d0 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
+const float cosine_sim_q0_d0 = common_embd_similarity_cos(q_rep[0].data(), d_rep[0].data(), n_embd);
-const float cosine_sim_q0_d1 = llama_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
+const float cosine_sim_q0_d1 = common_embd_similarity_cos(q_rep[0].data(), d_rep[1].data(), n_embd);
-const float cosine_sim_q1_d0 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
+const float cosine_sim_q1_d0 = common_embd_similarity_cos(q_rep[1].data(), d_rep[0].data(), n_embd);
-const float cosine_sim_q1_d1 = llama_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);
+const float cosine_sim_q1_d1 = common_embd_similarity_cos(q_rep[1].data(), d_rep[1].data(), n_embd);

std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[0].c_str(), cosine_sim_q0_d0);
std::printf("Cosine similarity between \"%.50s\" and \"%.50s\" is: %.3f\n", queries[0].c_str(), documents[1].c_str(), cosine_sim_q0_d1);
@@ -37,13 +37,13 @@ struct Stats {
class IMatrixCollector {
public:
IMatrixCollector() = default;
-void set_params(gpt_params params) { m_params = std::move(params); }
+void set_params(common_params params) { m_params = std::move(params); }
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
void save_imatrix(int ncall = -1) const;
bool load_imatrix(const char * file_name);
private:
std::unordered_map<std::string, Stats> m_stats;
-gpt_params m_params;
+common_params m_params;
std::mutex m_mutex;
int m_last_call = 0;
std::vector<float> m_src1_data;
@@ -428,7 +428,7 @@ static void process_logits(
}
}

-static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
+static bool compute_imatrix(llama_context * ctx, const common_params & params) {
const bool add_bos = llama_add_bos_token(llama_get_model(ctx));
GGML_ASSERT(!llama_add_eos_token(llama_get_model(ctx)));
const int n_ctx = llama_n_ctx(ctx);
@@ -436,7 +436,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
auto tim1 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenizing the input ..\n", __func__);

-std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
+std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

auto tim2 = std::chrono::high_resolution_clock::now();
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
@@ -568,17 +568,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
}

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;

params.n_ctx = 512;
params.logits_all = true;
params.escape = false;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
return 1;
}

-gpt_init();
+common_init();

params.n_batch = std::min(params.n_batch, params.n_ctx);

@@ -607,7 +607,7 @@ int main(int argc, char ** argv) {
params.warmup = false;

// init
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);

llama_model * model = llama_init.model;
llama_context * ctx = llama_init.context;
@@ -625,7 +625,7 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_INF("\n");
-LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}

if (!compute_imatrix(ctx, params)) {
@@ -35,8 +35,8 @@

static llama_context ** g_ctx;
static llama_model ** g_model;
-static gpt_sampler ** g_smpl;
+static common_sampler ** g_smpl;
-static gpt_params * g_params;
+static common_params * g_params;
static std::vector<llama_token> * g_input_tokens;
static std::ostringstream * g_output_ss;
static std::vector<llama_token> * g_output_tokens;
@@ -44,7 +44,7 @@ static std::vector<llama_token> * g_output_tokens;
static bool is_interacting = false;

static void write_logfile(
-const llama_context * ctx, const gpt_params & params, const llama_model * model,
+const llama_context * ctx, const common_params & params, const llama_model * model,
const std::vector<llama_token> & input_tokens, const std::string & output,
const std::vector<llama_token> & output_tokens
) {
@@ -95,12 +95,12 @@ static void sigint_handler(int signo) {
} else {
console::cleanup();
LOG("\n");
-gpt_perf_print(*g_ctx, *g_smpl);
+common_perf_print(*g_ctx, *g_smpl);
write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);

// make sure all logs are flushed
LOG("Interrupted by user\n");
-gpt_log_pause(gpt_log_main());
+common_log_pause(common_log_main());

_exit(130);
}
@@ -109,14 +109,14 @@ static void sigint_handler(int signo) {
#endif

int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;
g_params = &params;

-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_INFILL)) {
return 1;
}

-gpt_init();
+common_init();

auto & sparams = params.sparams;

@@ -166,7 +166,7 @@ int main(int argc, char ** argv) {

llama_model * model = nullptr;
llama_context * ctx = nullptr;
-gpt_sampler * smpl = nullptr;
+common_sampler * smpl = nullptr;

g_model = &model;
g_ctx = &ctx;
@@ -174,7 +174,7 @@ int main(int argc, char ** argv) {

// load the model and apply lora adapter, if any
LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);

model = llama_init.model;
ctx = llama_init.context;
@@ -195,21 +195,21 @@ int main(int argc, char ** argv) {
// print system information
{
LOG_INF("\n");
-LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
}
const bool add_bos = llama_add_bos_token(model);
GGML_ASSERT(!llama_add_eos_token(model));

std::vector<llama_token> embd_inp;
std::vector<llama_token> embd_end;
-std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
+std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
-std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
+std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);

-GGML_ASSERT(llama_token_prefix(model) >= 0);
+GGML_ASSERT(llama_token_fim_pre(model) >= 0);
-GGML_ASSERT(llama_token_suffix(model) >= 0);
+GGML_ASSERT(llama_token_fim_suf(model) >= 0);

-inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
+inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
-inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
+inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));

embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
@@ -218,7 +218,7 @@ int main(int argc, char ** argv) {
}
embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());

-const llama_token middle_token = llama_token_middle(model);
+const llama_token middle_token = llama_token_fim_mid(model);
if (middle_token >= 0) {
embd_inp.push_back(middle_token);
}
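Note: the infill hunks above rename the fill-in-the-middle special-token getters (llama_token_prefix/suffix/middle become llama_token_fim_pre/fim_suf/fim_mid). A simplified sketch of assembling a non-SPM FIM prompt with the new names; it assumes an existing llama_model * model and pre-tokenized prefix_tokens/suffix_tokens, and is illustrative only.

// Illustrative sketch (simplified; the real code also handles params.spm_infill):
std::vector<llama_token> inp;
inp.push_back(llama_token_fim_pre(model));
inp.insert(inp.end(), prefix_tokens.begin(), prefix_tokens.end());
inp.push_back(llama_token_fim_suf(model));
inp.insert(inp.end(), suffix_tokens.begin(), suffix_tokens.end());

const llama_token fim_mid = llama_token_fim_mid(model);
if (fim_mid >= 0) {
    inp.push_back(fim_mid);               // generation continues after the MID token
}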
@@ -257,13 +257,13 @@ int main(int argc, char ** argv) {
LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
}

if (params.n_keep > 0) {
LOG_INF("%s: static prompt based on n_keep: '", __func__);
for (int i = 0; i < params.n_keep; i++) {
-LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
}
LOG_CNT("'\n");
}
@@ -298,11 +298,11 @@ int main(int argc, char ** argv) {
LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
}
}
-smpl = gpt_sampler_init(model, sparams);
+smpl = common_sampler_init(model, sparams);

-LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());

LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);

@@ -411,9 +411,9 @@ int main(int argc, char ** argv) {
embd.clear();

if ((int) embd_inp.size() <= n_consumed && !is_interacting) {
-const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+const llama_token id = common_sampler_sample(smpl, ctx, -1);

-gpt_sampler_accept(smpl, id, true);
+common_sampler_accept(smpl, id, true);

// LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());

@@ -434,7 +434,7 @@ int main(int argc, char ** argv) {

// push the prompt in the sampling context in order to apply repetition penalties later
// for the prompt, we don't apply grammar rules
-gpt_sampler_accept(smpl, embd_inp[n_consumed], false);
+common_sampler_accept(smpl, embd_inp[n_consumed], false);

++n_consumed;
if ((int) embd.size() >= params.n_batch) {
@@ -446,7 +446,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
|
const std::string token_str = common_token_to_piece(ctx, id);
|
||||||
LOG("%s", token_str.c_str());
|
LOG("%s", token_str.c_str());
|
||||||
|
|
||||||
if (embd.size() > 1) {
|
if (embd.size() > 1) {
|
||||||
|
@ -465,10 +465,10 @@ int main(int argc, char ** argv) {
|
||||||
// if not currently processing queued inputs;
|
// if not currently processing queued inputs;
|
||||||
if ((int) embd_inp.size() <= n_consumed) {
|
if ((int) embd_inp.size() <= n_consumed) {
|
||||||
// deal with eot token in infill mode
|
// deal with eot token in infill mode
|
||||||
if ((gpt_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
if ((common_sampler_last(smpl) == llama_token_eot(model) || is_interacting) && params.interactive){
|
||||||
if (is_interacting && !params.interactive_first) {
|
if (is_interacting && !params.interactive_first) {
|
||||||
// print an eot token
|
// print an eot token
|
||||||
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
}
|
}
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
console::set_display(console::user_input);
|
console::set_display(console::user_input);
|
||||||
|
@ -505,11 +505,11 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenize new prefix and suffix
|
// tokenize new prefix and suffix
|
||||||
std::vector<llama_token> inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false);
|
std::vector<llama_token> inp_pfx = common_tokenize(ctx, params.input_prefix, false);
|
||||||
std::vector<llama_token> inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false);
|
std::vector<llama_token> inp_sfx = common_tokenize(ctx, params.input_suffix, false);
|
||||||
|
|
||||||
inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model));
|
inp_pfx.insert(inp_pfx.begin(), llama_token_fim_pre(model));
|
||||||
inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model));
|
inp_sfx.insert(inp_sfx.begin(), llama_token_fim_suf(model));
|
||||||
|
|
||||||
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
embd_inp = params.spm_infill ? inp_sfx : inp_pfx;
|
||||||
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
embd_end = params.spm_infill ? inp_pfx : inp_sfx;
|
||||||
|
@ -529,7 +529,7 @@ int main(int argc, char ** argv) {
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
// deal with end of generation tokens in interactive mode
|
// deal with end of generation tokens in interactive mode
|
||||||
else if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
|
else if (llama_token_is_eog(model, common_sampler_last(smpl))) {
|
||||||
LOG_DBG("found EOS token\n");
|
LOG_DBG("found EOS token\n");
|
||||||
|
|
||||||
if (params.interactive) {
|
if (params.interactive) {
|
||||||
|
@ -579,7 +579,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const size_t original_size = embd_inp.size();
|
const size_t original_size = embd_inp.size();
|
||||||
|
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false);
|
const auto line_inp = common_tokenize(ctx, buffer, false);
|
||||||
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
|
||||||
|
|
||||||
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
|
||||||
|
@ -587,7 +587,7 @@ int main(int argc, char ** argv) {
|
||||||
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
for (size_t i = original_size; i < embd_inp.size(); ++i) {
|
||||||
const llama_token token = embd_inp[i];
|
const llama_token token = embd_inp[i];
|
||||||
output_tokens.push_back(token);
|
output_tokens.push_back(token);
|
||||||
output_ss << llama_token_to_piece(ctx, token);
|
output_ss << common_token_to_piece(ctx, token);
|
||||||
}
|
}
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
|
@ -601,7 +601,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
if (n_past > 0) {
|
if (n_past > 0) {
|
||||||
if (is_interacting) {
|
if (is_interacting) {
|
||||||
gpt_sampler_reset(smpl);
|
common_sampler_reset(smpl);
|
||||||
}
|
}
|
||||||
is_interacting = false;
|
is_interacting = false;
|
||||||
}
|
}
|
||||||
|
@ -620,17 +620,17 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!params.interactive && n_remain <= 0) {
|
if (!params.interactive && n_remain <= 0) {
|
||||||
LOG("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
LOG("%s", common_token_to_piece(ctx, llama_token_eot(model)).c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG("\n");
|
LOG("\n");
|
||||||
gpt_perf_print(ctx, smpl);
|
common_perf_print(ctx, smpl);
|
||||||
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
|
||||||
|
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
gpt_sampler_free(smpl);
|
common_sampler_free(smpl);
|
||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
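The hunks above are part of the repository-wide rename of the gpt_* helpers to common_*. Below is a minimal sketch of the renamed sampling lifecycle, using only calls that appear in this diff; the header names and the wrapper function are assumptions, everything else mirrors the usage shown above.

    // A minimal sketch of the renamed sampling loop, assuming the llama.cpp
    // "common" library; "common.h"/"sampling.h" are assumed header names.
    #include "common.h"
    #include "sampling.h"
    #include <cstdio>

    static void generate_n(llama_context * ctx, llama_model * model,
                           common_params & params, int n_predict) {
        common_sampler * smpl = common_sampler_init(model, params.sparams);

        for (int i = 0; i < n_predict; ++i) {
            // sample from the last decoded logits, then record the choice
            const llama_token id = common_sampler_sample(smpl, ctx, -1);
            common_sampler_accept(smpl, id, true);

            printf("%s", common_token_to_piece(ctx, id).c_str());
            // ... feed `id` back through llama_decode() before the next iteration ...
        }

        common_perf_print(ctx, smpl);
        common_sampler_free(smpl);
    }

Accepting a sampled token with true keeps the sampler's repetition and grammar state in sync with the output, which is why the prompt tokens in the hunks above are accepted with false instead.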
@@ -304,9 +304,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
     printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
     printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-#ifdef GGML_USE_RPC
+    if (llama_supports_rpc()) {
     printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-#endif
+    }
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
@@ -497,14 +497,12 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            }
            auto p = string_split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-#ifdef GGML_USE_RPC
-        } else if (arg == "-rpc" || arg == "--rpc") {
+        } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.rpc_servers.push_back(argv[i]);
-#endif
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
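The change above replaces the compile-time GGML_USE_RPC guard with a run-time llama_supports_rpc() check, so one binary can decide whether to expose the flag. A hedged sketch of the same pattern; only llama_supports_rpc() comes from the diff, the helper below is hypothetical.

    // Run-time capability check instead of an #ifdef.
    #include "llama.h"
    #include <cstdio>

    static void print_rpc_usage_line() {
        if (llama_supports_rpc()) {
            printf("  -rpc, --rpc <rpc_servers>\n");
        }
    }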
@@ -18,6 +18,7 @@ android {
        }
        externalNativeBuild {
            cmake {
+               arguments += "-DLLAMA_BUILD_COMMON=ON"
                arguments += "-DCMAKE_BUILD_TYPE=Release"
                cppFlags += listOf()
                arguments += listOf()
@@ -186,11 +186,11 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
    for (nri = 0; nri < nr; nri++) {
        LOGi("Benchmark prompt processing (pp)");

-       llama_batch_clear(*batch);
+       common_batch_clear(*batch);

        const int n_tokens = pp;
        for (i = 0; i < n_tokens; i++) {
-           llama_batch_add(*batch, 0, i, { 0 }, false);
+           common_batch_add(*batch, 0, i, { 0 }, false);
        }

        batch->logits[batch->n_tokens - 1] = true;
@@ -210,9 +210,9 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
        const auto t_tg_start = ggml_time_us();
        for (i = 0; i < tg; i++) {

-           llama_batch_clear(*batch);
+           common_batch_clear(*batch);
            for (j = 0; j < pl; j++) {
-               llama_batch_add(*batch, 0, i, { j }, true);
+               common_batch_add(*batch, 0, i, { j }, true);
            }

            LOGi("llama_decode() text generation: %d", i);
@@ -357,7 +357,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    const auto context = reinterpret_cast<llama_context *>(context_pointer);
    const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);

-   const auto tokens_list = llama_tokenize(context, text, 1);
+   const auto tokens_list = common_tokenize(context, text, 1);

    auto n_ctx = llama_n_ctx(context);
    auto n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
@@ -369,14 +369,14 @@ Java_android_llama_cpp_LLamaAndroid_completion_1init(
    }

    for (auto id : tokens_list) {
-       LOGi("%s", llama_token_to_piece(context, id).c_str());
+       LOGi("%s", common_token_to_piece(context, id).c_str());
    }

-   llama_batch_clear(*batch);
+   common_batch_clear(*batch);

    // evaluate the initial prompt
    for (auto i = 0; i < tokens_list.size(); i++) {
-       llama_batch_add(*batch, tokens_list[i], i, { 0 }, false);
+       common_batch_add(*batch, tokens_list[i], i, { 0 }, false);
    }

    // llama_decode will output logits only for the last token of the prompt
@@ -419,7 +419,7 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        return nullptr;
    }

-   auto new_token_chars = llama_token_to_piece(context, new_token_id);
+   auto new_token_chars = common_token_to_piece(context, new_token_id);
    cached_token_chars += new_token_chars;

    jstring new_token = nullptr;
@@ -431,8 +431,8 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
        new_token = env->NewStringUTF("");
    }

-   llama_batch_clear(*batch);
-   llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
+   common_batch_clear(*batch);
+   common_batch_add(*batch, new_token_id, n_cur, { 0 }, true);

    env->CallVoidMethod(intvar_ncur, la_int_var_inc);
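These JNI hunks switch the Android bindings to the renamed batch helpers. A small sketch of the common_batch_clear / common_batch_add pattern used above, assuming the llama.cpp common library; the wrapper function itself is illustrative only.

    // Queue a prompt into a llama_batch with the renamed helpers.
    #include "common.h"
    #include <vector>

    static void queue_prompt(llama_batch & batch, const std::vector<llama_token> & tokens) {
        common_batch_clear(batch);
        for (size_t i = 0; i < tokens.size(); ++i) {
            // token, position, sequence ids, request logits?
            common_batch_add(batch, tokens[i], (llama_pos) i, { 0 }, false);
        }
        // compute logits only for the last prompt token
        batch.logits[batch.n_tokens - 1] = true;
    }

Requesting logits only for the final token matches the batch->logits[batch->n_tokens - 1] = true; line in the bench code above.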
@@ -1,135 +0,0 @@
-" Requires an already running llama.cpp server
-" To install either copy or symlink to ~/.vim/autoload/llama.vim
-" Then start with either :call llama#doLlamaGen(),
-" or add a keybind to your vimrc such as
-" nnoremap Z :call llama#doLlamaGen()<CR>
-" Similarly, you could add an insert mode keybind with
-" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
-"
-" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
-" let g:llama_api_url = "192.168.1.10:8080"
-" llama_overrides can also be set through buffer/window scopes. For instance
-" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
-" Could be added to your .vimrc to automatically set a lower temperature when
-" editing a python script
-" Additionally, an override dict can be stored at the top of a file
-" !*{"stop": ["User:"]}
-" Could be added to the start of your chatlog.txt to set the stopping token
-" These parameter dicts are merged together from lowest to highest priority:
-" server default -> g:llama_overrides -> w:llama_overrides ->
-" b:llama_overrides -> in file (!*) overrides
-"
-" Sublists (like logit_bias and stop) are overridden, not merged
-" Example override:
-" !*{"logit_bias": [[13, -5], [2, false]], "temperature": 1, "top_k": 5, "top_p": 0.5, "n_predict": 256, "repeat_last_n": 256, "repeat_penalty": 1.17647}
-if !exists("g:llama_api_url")
-    let g:llama_api_url= "127.0.0.1:8080"
-endif
-if !exists("g:llama_overrides")
-    let g:llama_overrides = {}
-endif
-const s:querydata = {"n_predict": 256, "stop": [ "\n" ], "stream": v:true }
-const s:curlcommand = ['curl','--data-raw', "{\"prompt\":\"### System:\"}", '--silent', '--no-buffer', '--request', 'POST', '--url', g:llama_api_url .. '/completion', '--header', "Content-Type: application/json"]
-let s:linedict = {}
-
-func s:callbackHandler(bufn, channel, msg)
-    if len(a:msg) < 3
-        return
-    elseif a:msg[0] == "d"
-        let l:msg = a:msg[6:-1]
-    else
-        let l:msg = a:msg
-    endif
-    let l:decoded_msg = json_decode(l:msg)
-    let l:newtext = split(l:decoded_msg['content'], "\n", 1)
-    if len(l:newtext) > 0
-        call setbufline(a:bufn, s:linedict[a:bufn], getbufline(a:bufn, s:linedict[a:bufn])[0] .. newtext[0])
-    else
-        echo "nothing genned"
-    endif
-    if len(newtext) > 1
-        let l:failed = appendbufline(a:bufn, s:linedict[a:bufn], newtext[1:-1])
-        let s:linedict[a:bufn] = s:linedict[a:bufn] + len(newtext)-1
-    endif
-    if has_key(l:decoded_msg, "stop") && l:decoded_msg.stop
-        echo "Finished generation"
-    endif
-endfunction
-
-func llama#doLlamaGen()
-    if exists("b:job")
-        if job_status(b:job) == "run"
-            call job_stop(b:job)
-            return
-        endif
-    endif
-
-    let l:cbuffer = bufnr("%")
-    let s:linedict[l:cbuffer] = line('$')
-    let l:buflines = getbufline(l:cbuffer, 1, 1000)
-    let l:querydata = copy(s:querydata)
-    call extend(l:querydata, g:llama_overrides)
-    if exists("w:llama_overrides")
-        call extend(l:querydata, w:llama_overrides)
-    endif
-    if exists("b:llama_overrides")
-        call extend(l:querydata, b:llama_overrides)
-    endif
-    if l:buflines[0][0:1] == '!*'
-        let l:userdata = json_decode(l:buflines[0][2:-1])
-        call extend(l:querydata, l:userdata)
-        let l:buflines = l:buflines[1:-1]
-    endif
-    let l:querydata.prompt = join(l:buflines, "\n")
-    let l:curlcommand = copy(s:curlcommand)
-    if exists("g:llama_api_key")
-        call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
-    endif
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
-endfunction
-
-" Echos the tokkenization of the provided string , or cursor to end of word
-" Onus is placed on the user to include the preceding space
-func llama#tokenizeWord(...)
-    if (a:0 > 0)
-        let l:input = a:1
-    else
-        exe "normal \"*ye"
-        let l:input = @*
-    endif
-    let l:querydata = {"content": l:input}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-    let s:token_job = job_start(l:curlcommand, {"callback": function("s:tokenizeWordCallback", [l:input])})
-endfunction
-
-func s:tokenizeWordCallback(plaintext, channel, msg)
-    echo '"' .. a:plaintext ..'" - ' .. string(json_decode(a:msg).tokens)
-endfunction
-
-
-" Echos the token count of the entire buffer (or provided string)
-" Example usage :echo llama#tokenCount()
-func llama#tokenCount(...)
-    if (a:0 > 0)
-        let l:buflines = a:1
-    else
-        let l:buflines = getline(1,1000)
-        if l:buflines[0][0:1] == '!*'
-            let l:buflines = l:buflines[1:-1]
-        endif
-        let l:buflines = join(l:buflines, "\n")
-    endif
-    let l:querydata = {"content": l:buflines}
-    let l:curlcommand = copy(s:curlcommand)
-    let l:curlcommand[2] = json_encode(l:querydata)
-    let l:curlcommand[8] = g:llama_api_url .. "/tokenize"
-    let s:token_job = job_start(l:curlcommand, {"callback": "s:tokenCountCallback"})
-endfunction
-
-func s:tokenCountCallback(channel, msg)
-    let resp = json_decode(a:msg)
-    echo len(resp.tokens)
-endfunction
@@ -37,21 +37,21 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {

 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
-   std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+   std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
    eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
    return true;
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct common_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-   const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-   gpt_sampler_accept(smpl, id, true);
+   const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+   common_sampler_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
-       ret = llama_token_to_piece(ctx_llama, id);
+       ret = common_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
@@ -120,7 +120,7 @@ static void print_usage(int, char ** argv) {
    LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) {

    // load and preprocess the image
    llava_image_embed * embed = NULL;
@@ -146,7 +146,7 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
    return embed;
 }

-static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, gpt_params * params, const std::string & prompt) {
+static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) {
    int n_past = 0;

    const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
@@ -159,16 +159,16 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
        LOG_INF("system_prompt: %s\n", system_prompt.c_str());
        if (params->verbose_prompt) {
-           auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
+           auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-               LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+               LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
        LOG_INF("user_prompt: %s\n", user_prompt.c_str());
        if (params->verbose_prompt) {
-           auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+           auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-               LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+               LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    } else {
@@ -176,9 +176,9 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
        user_prompt = prompt + "\nASSISTANT:";
        if (params->verbose_prompt) {
-           auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
+           auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true);
            for (int i = 0; i < (int) tmp.size(); i++) {
-               LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
+               LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str());
            }
        }
    }
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_

    LOG("\n");

-   struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+   struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
    if (!smpl) {
        LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
        exit(1);
@@ -211,15 +211,15 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
        fflush(stdout);
    }

-   gpt_sampler_free(smpl);
+   common_sampler_free(smpl);
    LOG("\n");
 }

-static struct llama_model * llava_init(gpt_params * params) {
+static struct llama_model * llava_init(common_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

-   llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+   llama_model_params model_params = common_model_params_to_llama(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
@@ -229,7 +229,7 @@ static struct llama_model * llava_init(gpt_params * params) {
    return model;
 }

-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
@@ -240,7 +240,7 @@ static struct llava_context * llava_init_context(gpt_params * params, llama_mode
    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);


-   llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+   llama_context_params ctx_params = common_context_params_to_llama(*params);
    ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings

    llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params);
@@ -272,13 +272,13 @@ static void llava_free(struct llava_context * ctx_llava) {
 int main(int argc, char ** argv) {
    ggml_time_init();

-   gpt_params params;
+   common_params params;

-   if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
+   if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) {
        return 1;
    }

-   gpt_init();
+   common_init();

    if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) {
        print_usage(argc, argv);
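The llava-cli hunks also rename the parameter-conversion helpers. A sketch of the init sequence as used in llava_init() and llava_init_context() above, assuming a populated common_params; the wrapper function name is made up.

    // Build a model and context from common_params with the renamed helpers.
    #include "common.h"

    static llama_context * init_from(common_params & params) {
        llama_backend_init();

        llama_model_params model_params = common_model_params_to_llama(params);
        llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
        if (model == NULL) {
            return NULL;
        }

        llama_context_params ctx_params = common_context_params_to_llama(params);
        return llama_new_context_with_model(model, ctx_params);
    }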
@@ -25,11 +25,11 @@ static void show_additional_info(int /*argc*/, char ** argv) {
    LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llama_model * llava_init(gpt_params * params) {
+static struct llama_model * llava_init(common_params * params) {
    llama_backend_init();
    llama_numa_init(params->numa);

-   llama_model_params model_params = llama_model_params_from_gpt_params(*params);
+   llama_model_params model_params = common_model_params_to_llama(*params);

    llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params);
    if (model == NULL) {
@@ -39,13 +39,13 @@ static struct llama_model * llava_init(gpt_params * params) {
    return model;
 }

-static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+static struct llava_context * llava_init_context(common_params * params, llama_model * model) {
    auto prompt = params->prompt;
    if (prompt.empty()) {
        prompt = "describe the image in detail.";
    }

-   llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
+   llama_context_params ctx_params = common_context_params_to_llama(*params);
    if (params->n_ctx < 2048) {
        // warn user here, "Image processing requires at least 2048 context, setting context to 2048"
        LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__);
@@ -79,7 +79,7 @@ static void llava_free(struct llava_context * ctx_llava) {
    llama_backend_free();
 }

-static struct clip_ctx * clip_init_context(gpt_params * params) {
+static struct clip_ctx * clip_init_context(common_params * params) {
    const char * clip_path = params->mmproj.c_str();

    auto prompt = params->prompt;
@@ -114,7 +114,7 @@ static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) {

 static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){
    std::string str2 = str;
-   std::vector<llama_token> embd_inp = ::llama_tokenize(ctx_llama, str2, add_bos, true);
+   std::vector<llama_token> embd_inp = common_tokenize(ctx_llama, str2, add_bos, true);
    return eval_tokens(ctx_llama, embd_inp, n_batch, n_past);
 }

@@ -129,7 +129,7 @@ static void process_eval_image_embed(struct llava_context * ctx_llava, const str
    llava_image_embed_free(slice_embed);
 }

-static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, gpt_params * params, int &n_past) {
+static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) {
    std::string system_prompt;
    int idx = 0;
    int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip);
@@ -162,22 +162,22 @@ static void process_image(struct llava_context * ctx_llava, struct llava_image_e
    LOG_INF("%s: image token past: %d\n", __func__, n_past);
 }

-static const char * sample(struct gpt_sampler * smpl,
+static const char * sample(struct common_sampler * smpl,
                            struct llama_context * ctx_llama,
                            int * n_past) {
-   const llama_token id = gpt_sampler_sample(smpl, ctx_llama, -1);
-   gpt_sampler_accept(smpl, id, true);
+   const llama_token id = common_sampler_sample(smpl, ctx_llama, -1);
+   common_sampler_accept(smpl, id, true);
    static std::string ret;
    if (llama_token_is_eog(llama_get_model(ctx_llama), id)) {
        ret = "</s>";
    } else {
-       ret = llama_token_to_piece(ctx_llama, id);
+       ret = common_token_to_piece(ctx_llama, id);
    }
    eval_id(ctx_llama, id, n_past);
    return ret.c_str();
 }

-static struct llava_context * minicpmv_init(gpt_params * params, const std::string & fname, int &n_past){
+static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){
    auto * ctx_clip = clip_init_context(params);
    auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str());
    if (!embeds) {
@@ -213,7 +213,7 @@ static struct llava_context * minicpmv_init(gpt_params * params, const std::stri
    return ctx_llava;
 }

-static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_params * params, const std::string & prompt, int & n_past, bool is_first = false){
+static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){
    std::string user_prompt = prompt;
    int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip);
    if (!is_first) {
@@ -237,11 +237,11 @@ static struct gpt_sampler * llama_init(struct llava_context * ctx_llava, gpt_par

    LOG_INF("\n");

-   struct gpt_sampler * smpl = gpt_sampler_init(ctx_llava->model, params->sparams);
+   struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
    return smpl;
 }

-static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampler * smpl, int &n_past){
+static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){

    const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past);
    return tmp;
@@ -250,13 +250,13 @@ static const char * llama_loop(struct llava_context * ctx_llava,struct gpt_sampl
 int main(int argc, char ** argv) {
    ggml_time_init();

-   gpt_params params;
+   common_params params;

-   if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
+   if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) {
        return 1;
    }

-   gpt_init();
+   common_init();

    if (params.mmproj.empty() || (params.image.empty())) {
        show_additional_info(argc, argv);
@@ -290,7 +290,7 @@ int main(int argc, char ** argv) {

            fflush(stdout);
        }
-       gpt_sampler_free(smpl);
+       common_sampler_free(smpl);
    }else {
        while (true) {
            LOG("<user>");
@@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
            if (strstr(response.c_str(), "<user>")) break; // minicpm-v
            fflush(stdout);
        }
-       gpt_sampler_free(smpl);
    }
    }
    printf("\n");
+       common_sampler_free(smpl);
@@ -37,13 +37,13 @@ struct ngram_container {
 };

 int main(int argc, char ** argv) {
-   gpt_params params;
+   common_params params;

-   if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+   if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
        return 1;
    }

-   gpt_init();
+   common_init();

    const int W = 15; // lookahead window
    const int N = 5; // n-gram size
@@ -56,7 +56,7 @@ int main(int argc, char ** argv) {
    llama_numa_init(params.numa);

    // load the target model
-   llama_init_result llama_init = llama_init_from_gpt_params(params);
+   common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
    std::vector<llama_token> inp;
    std::vector<llama_token> all;

-   inp = ::llama_tokenize(ctx, params.prompt, true, true);
+   inp = common_tokenize(ctx, params.prompt, true, true);
    all = inp;

    const int max_context_size = llama_n_ctx(ctx);
@@ -79,7 +79,7 @@ int main(int argc, char ** argv) {
    LOG("\n\n");

    for (auto id : inp) {
-       LOG("%s", llama_token_to_piece(ctx, id).c_str());
+       LOG("%s", common_token_to_piece(ctx, id).c_str());
    }

    fflush(stderr);
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
    llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);

    // target model sampling context
-   struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+   struct common_sampler * smpl = common_sampler_init(model, params.sparams);

    // verification n-grams
    std::vector<ngram_data> ngrams_cur(G);
@@ -156,12 +156,12 @@ int main(int argc, char ** argv) {

    // sample first token
    {
-       id = gpt_sampler_sample(smpl, ctx, 0);
+       id = common_sampler_sample(smpl, ctx, 0);

-       gpt_sampler_accept(smpl, id, true);
+       common_sampler_accept(smpl, id, true);

        {
-           const std::string token_str = llama_token_to_piece(ctx, id);
+           const std::string token_str = common_token_to_piece(ctx, id);

            LOG("%s", token_str.c_str());
            fflush(stdout);
@@ -172,7 +172,7 @@ int main(int argc, char ** argv) {
        // debug
        if (dump_kv_cache) {
            llama_kv_cache_view_update(ctx, &kvc_view);
-           llama_kv_cache_dump_view_seqs(kvc_view, 40);
+           common_kv_cache_dump_view_seqs(kvc_view, 40);
        }

        // build the mask from https://lmsys.org/blog/2023-11-21-lookahead-decoding/
@@ -201,10 +201,10 @@ int main(int argc, char ** argv) {
        // V V V V V V
        // id
        {
-           llama_batch_clear(batch);
+           common_batch_clear(batch);

            // current token - first token of the first level
-           llama_batch_add(batch, id, n_past, seq_id_all, true);
+           common_batch_add(batch, id, n_past, seq_id_all, true);

            // verification n-grams - queue this before the lookahead tokens for less KV cache fragmentation
            {
@@ -229,7 +229,7 @@ int main(int argc, char ** argv) {
                    ngrams_cur[g].tokens [j + 1] = t;
                    ngrams_cur[g].i_batch[j + 1] = batch.n_tokens;

-                   llama_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
+                   common_batch_add(batch, t, n_past + j + 1, { W + 1 + g }, true);
                }
            }
        }
@@ -241,13 +241,13 @@ int main(int argc, char ** argv) {
                seq_id_look[j] = i + j + 1;
            }

-           llama_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
+           common_batch_add(batch, tokens_j[0][i], n_past + i, seq_id_look, false);
        }

        // fill the rest of the levels
        for (int j = 1; j < N - 1; j++) {
            for (int i = 0; i < W; i++) {
-               llama_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
+               common_batch_add(batch, tokens_j[j][i], n_past + j + i, { i + 1 }, j == N - 2);
            }
        }
    }
@@ -281,13 +281,13 @@ int main(int argc, char ** argv) {
            }

            // sample the next token
-           id = gpt_sampler_sample(smpl, ctx, i_batch);
+           id = common_sampler_sample(smpl, ctx, i_batch);

-           gpt_sampler_accept(smpl, id, true);
+           common_sampler_accept(smpl, id, true);

            // print
            {
-               const std::string token_str = llama_token_to_piece(ctx, id);
+               const std::string token_str = common_token_to_piece(ctx, id);

                if (v == 0) {
                    LOG("%s", token_str.c_str());
@@ -327,7 +327,7 @@ int main(int argc, char ** argv) {
            // print known n-grams starting with token id (debug)
            if (0 && v == 0) {
                if (ngrams_observed.cnt[id] > 0) {
-                   LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], llama_token_to_piece(ctx, id).c_str());
+                   LOG("\n - %d n-grams starting with '%s'\n", ngrams_observed.cnt[id], common_token_to_piece(ctx, id).c_str());
                }

                for (int i = 0; i < ngrams_observed.cnt[id]; i++) {
@@ -336,7 +336,7 @@ int main(int argc, char ** argv) {
                    const int idx = id*(N - 1)*G + i*(N - 1);

                    for (int j = 0; j < N - 1; j++) {
-                       const std::string token_str = llama_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);
+                       const std::string token_str = common_token_to_piece(ctx, ngrams_observed.tokens[idx + j]);

                        LOG("%s", token_str.c_str());
                    }
@@ -358,7 +358,7 @@ int main(int argc, char ** argv) {
            if (v == 0) {
                // sample from the last level
                for (int i = 0; i < W; i++) {
-                   tokens_j[N - 2][i] = gpt_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
+                   tokens_j[N - 2][i] = common_sampler_sample(smpl, ctx, ngrams_cur.size()*(N-1) + W*(N - 2) + i);
                }
            } else {
                for (int i = 0; i < W; i++) {
@@ -466,9 +466,9 @@ int main(int argc, char ** argv) {
    LOG_INF("n_accept = %d\n", n_accept);

    LOG_INF("\n");
-   gpt_perf_print(ctx, smpl);
+   common_perf_print(ctx, smpl);

-   gpt_sampler_free(smpl);
+   common_sampler_free(smpl);

    llama_kv_cache_view_free(&kvc_view);
@@ -12,9 +12,9 @@
 #include <vector>

 int main(int argc, char ** argv){
-   gpt_params params;
+   common_params params;

-   if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+   if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

@@ -23,7 +23,7 @@ int main(int argc, char ** argv){
    llama_numa_init(params.numa);

    // load the model
-   llama_init_result llama_init = llama_init_from_gpt_params(params);
+   common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;
@@ -31,15 +31,15 @@ int main(int argc, char ** argv){

    // tokenize the prompt
    std::vector<llama_token> inp;
-   inp = ::llama_tokenize(ctx, params.prompt, true, true);
+   inp = common_tokenize(ctx, params.prompt, true, true);
    fprintf(stderr, "%s: tokenization done\n", __func__);


-   llama_ngram_cache ngram_cache;
-   llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
+   common_ngram_cache ngram_cache;
+   common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
    fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str());

-   llama_ngram_cache_save(ngram_cache, params.lookup_cache_static);
+   common_ngram_cache_save(ngram_cache, params.lookup_cache_static);

    return 0;
 }
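lookup-create builds a static n-gram cache from the prompt and writes it to disk. A sketch of that flow with the renamed common_ngram_cache_* helpers; the "ngram-cache.h" header name and the wrapper function are assumptions, the common_* calls come from the diff.

    // Build and save a static n-gram lookup cache.
    #include "common.h"
    #include "ngram-cache.h"
    #include <string>
    #include <vector>

    static void build_static_cache(llama_context * ctx, const std::string & prompt, std::string out_path) {
        std::vector<llama_token> inp = common_tokenize(ctx, prompt, true, true);

        common_ngram_cache ngram_cache;
        // hash every n-gram of the static order over the full prompt
        common_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true);
        common_ngram_cache_save(ngram_cache, out_path);
    }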
@@ -33,15 +33,15 @@ int main(int argc, char ** argv){
    }

    fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str());
-   llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]);
+   common_ngram_cache ngram_cache_merged = common_ngram_cache_load(args[0]);

    for (size_t i = 1; i < args.size()-1; ++i) {
        fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str());
-       llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]);
+       common_ngram_cache ngram_cache = common_ngram_cache_load(args[i]);

-       llama_ngram_cache_merge(ngram_cache_merged, ngram_cache);
+       common_ngram_cache_merge(ngram_cache_merged, ngram_cache);
    }

    fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str());
-   llama_ngram_cache_save(ngram_cache_merged, args.back());
+   common_ngram_cache_save(ngram_cache_merged, args.back());
 }
@@ -13,13 +13,13 @@
 #include <vector>

 int main(int argc, char ** argv){
-   gpt_params params;
+   common_params params;

-   if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+   if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
        return 1;
    }

-   gpt_init();
+   common_init();

    // max. number of additional tokens to draft if match is found
    const int n_draft = params.n_draft;
@@ -28,18 +28,18 @@ int main(int argc, char ** argv){
    llama_numa_init(params.numa);

    // load the model
-   llama_init_result llama_init = llama_init_from_gpt_params(params);
+   common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model;
    llama_context * ctx = llama_init.context;

    // tokenize the prompt
    std::vector<llama_token> inp;
-   inp = ::llama_tokenize(ctx, params.prompt, true, true);
+   inp = common_tokenize(ctx, params.prompt, true, true);

-   llama_ngram_cache ngram_cache_context;
-   llama_ngram_cache ngram_cache_dynamic;
-   llama_ngram_cache ngram_cache_static;
+   common_ngram_cache ngram_cache_context;
+   common_ngram_cache ngram_cache_dynamic;
+   common_ngram_cache ngram_cache_static;
    int64_t t_draft_flat_us = 0;
    int64_t t_draft_us = 0;

@@ -48,7 +48,7 @@ int main(int argc, char ** argv){

    if (!params.lookup_cache_static.empty()) {
        try {
-           ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+           ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
        } catch (std::ifstream::failure const &) {
            LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
            exit(1);
@@ -57,7 +57,7 @@ int main(int argc, char ** argv){

    if (!params.lookup_cache_dynamic.empty()) {
        try {
-           ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+           ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
        } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
    }

@@ -86,7 +86,7 @@ int main(int argc, char ** argv){

        {
            const int64_t t_start_draft_us = ggml_time_us();
-           llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+           common_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
            t_draft_us += ggml_time_us() - t_start_draft_us;
        }

@@ -105,7 +105,7 @@ int main(int argc, char ** argv){

            {
                const int64_t t_start_draft_us = ggml_time_us();
-               llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+               common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                t_draft_us += ggml_time_us() - t_start_draft_us;
            }
        }
@@ -115,7 +115,7 @@ int main(int argc, char ** argv){
            pseudo_output.push_back(inp_slice[pseudo_output.size()]);
            {
                const int64_t t_start_draft_us = ggml_time_us();
-               llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
+               common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false);
                t_draft_us += ggml_time_us() - t_start_draft_us;
            }
        }
@@ -133,7 +133,7 @@ int main(int argc, char ** argv){
        }

        // After each chunk, update the dynamic ngram cache with the context ngram cache:
-       llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+       common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
        ngram_cache_context.clear();
    }
|
|
|
@@ -13,13 +13,13 @@
 #include <vector>
 
 int main(int argc, char ** argv){
-gpt_params params;
+common_params params;
 
-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
 return 1;
 }
 
-gpt_init();
+common_init();
 
 // max. number of additional tokens to draft if match is found
 const int n_draft = params.n_draft;
@@ -31,29 +31,29 @@ int main(int argc, char ** argv){
 llama_numa_init(params.numa);
 
 // load the model
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);
 
 llama_model * model = llama_init.model;
 llama_context * ctx = llama_init.context;
 
 // tokenize the prompt
 std::vector<llama_token> inp;
-inp = ::llama_tokenize(ctx, params.prompt, true, true);
+inp = common_tokenize(ctx, params.prompt, true, true);
 
-llama_ngram_cache ngram_cache_context;
-llama_ngram_cache ngram_cache_dynamic;
-llama_ngram_cache ngram_cache_static;
+common_ngram_cache ngram_cache_context;
+common_ngram_cache ngram_cache_dynamic;
+common_ngram_cache ngram_cache_static;
 int64_t t_draft_flat_us = 0;
 int64_t t_draft_us = 0;
 
 {
 // Fill up context ngram cache with tokens from user input:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
+common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false);
 
 if (!params.lookup_cache_static.empty()) {
 try {
-ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static);
+ngram_cache_static = common_ngram_cache_load(params.lookup_cache_static);
 } catch (std::ifstream::failure const &) {
 LOG_ERR("failed to open static lookup cache: %s", params.lookup_cache_static.c_str());
 exit(1);
@@ -62,7 +62,7 @@ int main(int argc, char ** argv){
 
 if (!params.lookup_cache_dynamic.empty()) {
 try {
-ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic);
+ngram_cache_dynamic = common_ngram_cache_load(params.lookup_cache_dynamic);
 } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program
 }
 
@@ -80,7 +80,7 @@ int main(int argc, char ** argv){
 LOG("\n\n");
 
 for (auto id : inp) {
-LOG("%s", llama_token_to_piece(ctx, id).c_str());
+LOG("%s", common_token_to_piece(ctx, id).c_str());
 }
 
 fflush(stderr);
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
 bool has_eos = false;
 
-struct gpt_sampler * smpl = gpt_sampler_init(model, params.sparams);
+struct common_sampler * smpl = common_sampler_init(model, params.sparams);
 
 std::vector<llama_token> draft;
 
@@ -117,7 +117,7 @@ int main(int argc, char ** argv){
 // debug
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
-llama_kv_cache_dump_view_seqs(kvc_view, 40);
+common_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
 // print current draft sequence
@@ -126,11 +126,11 @@ int main(int argc, char ** argv){
 int i_dft = 0;
 while (true) {
 // sample from the target model
-llama_token id = gpt_sampler_sample(smpl, ctx, i_dft);
+llama_token id = common_sampler_sample(smpl, ctx, i_dft);
 
-gpt_sampler_accept(smpl, id, true);
+common_sampler_accept(smpl, id, true);
 
-const std::string token_str = llama_token_to_piece(ctx, id);
+const std::string token_str = common_token_to_piece(ctx, id);
 
 if (!params.use_color) {
 LOG("%s", token_str.c_str());
@@ -152,7 +152,7 @@ int main(int argc, char ** argv){
 {
 // Update context ngram cache with the newly accepted token:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 
@@ -178,7 +178,7 @@ int main(int argc, char ** argv){
 {
 // Update context ngram cache with the newly accepted token:
 const int64_t t_start_draft_us = ggml_time_us();
-llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
+common_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false);
 t_draft_us += ggml_time_us() - t_start_draft_us;
 }
 break;
@@ -192,18 +192,18 @@ int main(int argc, char ** argv){
 // clean the cache of draft tokens that weren't accepted
 llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
 
-llama_batch_clear(batch_tgt);
-llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
+common_batch_clear(batch_tgt);
+common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
 
 // Draft already contains a single token sampled from the model:
 GGML_ASSERT(draft.size() == 1);
 GGML_ASSERT(draft[0] == inp.back());
 const int64_t t_start_draft_us = ggml_time_us();
 
-llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
+common_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static);
 
 for (size_t i = 1; i < draft.size(); ++i) {
-llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
+common_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true);
 }
 
 t_draft_us += ggml_time_us() - t_start_draft_us;
@@ -218,8 +218,8 @@ int main(int argc, char ** argv){
 auto t_dec_end = ggml_time_us();
 
 // Update dynamic ngram cache with context ngram cache and save it to disk:
-llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
-llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
+common_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context);
+common_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic);
 
 LOG("\n\n");
 
@@ -237,9 +237,9 @@ int main(int argc, char ** argv){
 LOG_INF("accept = %.3f%%\n", 100.0f * n_accept / n_drafted);
 
 LOG_INF("\ntarget:\n\n");
-gpt_perf_print(ctx, smpl);
+common_perf_print(ctx, smpl);
 
-gpt_sampler_free(smpl);
+common_sampler_free(smpl);
 
 llama_batch_free(batch_tgt);
 
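For orientation, the sketch below (not part of the commit) strings together the renamed common_* helpers that the two lookup examples above now call: common_params_parse, common_init_from_params, common_tokenize, the common_sampler_* family and the common_batch_* helpers. It is a minimal sketch only, assuming the common.h / sampling.h headers from the llama.cpp tree; the call signatures are taken from the hunks above, and the ngram-cache plumbing and error handling of the real examples are left out.

    // minimal sketch, assuming the post-rename common API shown in the hunks above
    #include "common.h"
    #include "sampling.h"
    #include "llama.h"

    #include <cstdio>
    #include <vector>

    int main(int argc, char ** argv) {
        common_params params;
        if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LOOKUP)) {
            return 1;
        }
        common_init();

        // model + context come back from a single call now
        common_init_result llama_init = common_init_from_params(params);
        llama_model   * model = llama_init.model;
        llama_context * ctx   = llama_init.context;

        // tokenize the prompt with the renamed helper
        std::vector<llama_token> inp = common_tokenize(ctx, params.prompt, true, true);

        // feed the prompt through a batch, requesting logits for the last token only
        llama_batch batch = llama_batch_init((int32_t) inp.size(), 0, 1);
        common_batch_clear(batch);
        for (size_t i = 0; i < inp.size(); ++i) {
            common_batch_add(batch, inp[i], (llama_pos) i, { 0 }, i == inp.size() - 1);
        }
        if (llama_decode(ctx, batch) != 0) {
            return 1;
        }

        // sampling now goes through common_sampler_*
        common_sampler * smpl = common_sampler_init(model, params.sparams);
        const llama_token id  = common_sampler_sample(smpl, ctx, -1);
        common_sampler_accept(smpl, id, /* accept_grammar= */ true);
        printf("%s\n", common_token_to_piece(ctx, id).c_str());

        common_sampler_free(smpl);
        llama_batch_free(batch);
        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }
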
@@ -69,7 +69,7 @@ In this section, we cover the most commonly used options for running the `llama-
 - `-c N, --ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference.
 - `-mli, --multiline-input`: Allows you to write or paste multiple lines without ending each in '\'
 - `-t N, --threads N`: Set the number of threads to use during generation. For optimal performance, it is recommended to set this value to the number of physical CPU cores your system has.
-- - `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
+- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
 
 ## Input Prompts
 
@@ -33,8 +33,8 @@
 
 static llama_context ** g_ctx;
 static llama_model ** g_model;
-static gpt_sampler ** g_smpl;
-static gpt_params * g_params;
+static common_sampler ** g_smpl;
+static common_params * g_params;
 static std::vector<llama_token> * g_input_tokens;
 static std::ostringstream * g_output_ss;
 static std::vector<llama_token> * g_output_tokens;
@@ -63,7 +63,7 @@ static bool file_is_empty(const std::string & path) {
 }
 
 static void write_logfile(
-const llama_context * ctx, const gpt_params & params, const llama_model * model,
+const llama_context * ctx, const common_params & params, const llama_model * model,
 const std::vector<llama_token> & input_tokens, const std::string & output,
 const std::vector<llama_token> & output_tokens
 ) {
@@ -114,12 +114,12 @@ static void sigint_handler(int signo) {
 } else {
 console::cleanup();
 LOG("\n");
-gpt_perf_print(*g_ctx, *g_smpl);
+common_perf_print(*g_ctx, *g_smpl);
 write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens);
 
 // make sure all logs are flushed
 LOG("Interrupted by user\n");
-gpt_log_pause(gpt_log_main());
+common_log_pause(common_log_main());
 
 _exit(130);
 }
@@ -127,22 +127,22 @@ static void sigint_handler(int signo) {
 }
 #endif
 
-static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
-llama_chat_msg new_msg{role, content};
-auto formatted = llama_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
+static std::string chat_add_and_format(struct llama_model * model, std::vector<common_chat_msg> & chat_msgs, const std::string & role, const std::string & content) {
+common_chat_msg new_msg{role, content};
+auto formatted = common_chat_format_single(model, g_params->chat_template, chat_msgs, new_msg, role == "user");
 chat_msgs.push_back({role, content});
 LOG_DBG("formatted: '%s'\n", formatted.c_str());
 return formatted;
 }
 
 int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;
 g_params = &params;
-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_MAIN, print_usage)) {
 return 1;
 }
 
-gpt_init();
+common_init();
 
 auto & sparams = params.sparams;
 
@@ -187,9 +187,9 @@ int main(int argc, char ** argv) {
 
 llama_model * model = nullptr;
 llama_context * ctx = nullptr;
-gpt_sampler * smpl = nullptr;
+common_sampler * smpl = nullptr;
 
-std::vector<llama_chat_msg> chat_msgs;
+std::vector<common_chat_msg> chat_msgs;
 
 g_model = &model;
 g_ctx = &ctx;
@@ -197,7 +197,7 @@ int main(int argc, char ** argv) {
 
 // load the model and apply lora adapter, if any
 LOG_INF("%s: load the model and apply lora adapter, if any\n", __func__);
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);
 
 model = llama_init.model;
 ctx = llama_init.context;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 // print chat template example in conversation mode
 if (params.conversation) {
 if (params.enable_chat_template) {
-LOG_INF("%s: chat template example:\n%s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
+LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(model, params.chat_template).c_str());
 } else {
 LOG_INF("%s: in-suffix/prefix is specified, chat template will be disabled\n", __func__);
 }
@@ -255,7 +255,7 @@ int main(int argc, char ** argv) {
 // print system information
 {
 LOG_INF("\n");
-LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+LOG_INF("%s\n", common_params_get_system_info(params).c_str());
 LOG_INF("\n");
 }
 
@@ -296,7 +296,7 @@ int main(int argc, char ** argv) {
 : params.prompt;
 if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
 LOG_DBG("tokenize the prompt\n");
-embd_inp = ::llama_tokenize(ctx, prompt, true, true);
+embd_inp = common_tokenize(ctx, prompt, true, true);
 } else {
 LOG_DBG("use session tokens\n");
 embd_inp = session_tokens;
@@ -379,13 +379,13 @@ int main(int argc, char ** argv) {
 LOG_INF("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
 LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
 for (int i = 0; i < (int) embd_inp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str());
+LOG_INF("%6d -> '%s'\n", embd_inp[i], common_token_to_piece(ctx, embd_inp[i]).c_str());
 }
 
 if (params.n_keep > add_bos) {
 LOG_INF("%s: static prompt based on n_keep: '", __func__);
 for (int i = 0; i < params.n_keep; i++) {
-LOG_CNT("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str());
+LOG_CNT("%s", common_token_to_piece(ctx, embd_inp[i]).c_str());
 }
 LOG_CNT("'\n");
 }
@@ -415,9 +415,9 @@ int main(int argc, char ** argv) {
 for (const auto & antiprompt : params.antiprompt) {
 LOG_INF("Reverse prompt: '%s'\n", antiprompt.c_str());
 if (params.verbose_prompt) {
-auto tmp = ::llama_tokenize(ctx, antiprompt, false, true);
+auto tmp = common_tokenize(ctx, antiprompt, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
@@ -430,9 +430,9 @@ int main(int argc, char ** argv) {
 if (!params.input_prefix.empty()) {
 LOG_INF("Input prefix: '%s'\n", params.input_prefix.c_str());
 if (params.verbose_prompt) {
-auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true);
+auto tmp = common_tokenize(ctx, params.input_prefix, true, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
@@ -440,23 +440,23 @@ int main(int argc, char ** argv) {
 if (!params.input_suffix.empty()) {
 LOG_INF("Input suffix: '%s'\n", params.input_suffix.c_str());
 if (params.verbose_prompt) {
-auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true);
+auto tmp = common_tokenize(ctx, params.input_suffix, false, true);
 for (int i = 0; i < (int) tmp.size(); i++) {
-LOG_INF("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str());
+LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx, tmp[i]).c_str());
 }
 }
 }
 }
 
-smpl = gpt_sampler_init(model, sparams);
+smpl = common_sampler_init(model, sparams);
 if (!smpl) {
 LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
 return 1;
 }
 
-LOG_INF("sampler seed: %u\n", gpt_sampler_get_seed(smpl));
+LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl));
 LOG_INF("sampler params: \n%s\n", sparams.print().c_str());
-LOG_INF("sampler chain: %s\n", gpt_sampler_print(smpl).c_str());
+LOG_INF("sampler chain: %s\n", common_sampler_print(smpl).c_str());
 
 LOG_INF("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
 
@@ -521,7 +521,7 @@ int main(int argc, char ** argv) {
 
 antiprompt_ids.reserve(params.antiprompt.size());
 for (const std::string & antiprompt : params.antiprompt) {
-antiprompt_ids.emplace_back(::llama_tokenize(ctx, antiprompt, false, true));
+antiprompt_ids.emplace_back(::common_tokenize(ctx, antiprompt, false, true));
 }
 
 if (llama_model_has_encoder(model)) {
@@ -679,9 +679,9 @@ int main(int argc, char ** argv) {
 LOG_DBG("saved session to %s\n", path_session.c_str());
 }
 
-const llama_token id = gpt_sampler_sample(smpl, ctx, -1);
+const llama_token id = common_sampler_sample(smpl, ctx, -1);
 
-gpt_sampler_accept(smpl, id, /* accept_grammar= */ true);
+common_sampler_accept(smpl, id, /* accept_grammar= */ true);
 
 // LOG_DBG("last: %s\n", string_from(ctx, smpl->prev.to_vector()).c_str());
 
@@ -702,7 +702,7 @@ int main(int argc, char ** argv) {
 
 // push the prompt in the sampling context in order to apply repetition penalties later
 // for the prompt, we don't apply grammar rules
-gpt_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
+common_sampler_accept(smpl, embd_inp[n_consumed], /* accept_grammar= */ false);
 
 ++n_consumed;
 if ((int) embd.size() >= params.n_batch) {
@@ -714,7 +714,7 @@ int main(int argc, char ** argv) {
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
-const std::string token_str = llama_token_to_piece(ctx, id, params.special);
+const std::string token_str = common_token_to_piece(ctx, id, params.special);
 
 // Console/Stream Output
 LOG("%s", token_str.c_str());
@@ -743,7 +743,7 @@ int main(int argc, char ** argv) {
 // check for reverse prompt in the last n_prev tokens
 if (!params.antiprompt.empty()) {
 const int n_prev = 32;
-const std::string last_output = gpt_sampler_prev_str(smpl, ctx, n_prev);
+const std::string last_output = common_sampler_prev_str(smpl, ctx, n_prev);
 
 is_antiprompt = false;
 // Check if each of the reverse prompts appears at the end of the output.
@@ -765,7 +765,7 @@ int main(int argc, char ** argv) {
 }
 
 // check for reverse prompt using special tokens
-llama_token last_token = gpt_sampler_last(smpl);
+llama_token last_token = common_sampler_last(smpl);
 for (std::vector<llama_token> ids : antiprompt_ids) {
 if (ids.size() == 1 && last_token == ids[0]) {
 if (params.interactive) {
@@ -782,13 +782,13 @@ int main(int argc, char ** argv) {
 }
 
 // deal with end of generation tokens in interactive mode
-if (llama_token_is_eog(model, gpt_sampler_last(smpl))) {
+if (llama_token_is_eog(model, common_sampler_last(smpl))) {
 LOG_DBG("found an EOG token\n");
 
 if (params.interactive) {
 if (!params.antiprompt.empty()) {
 // tokenize and inject first reverse prompt
-const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true);
+const auto first_antiprompt = common_tokenize(ctx, params.antiprompt.front(), false, true);
 embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end());
 is_antiprompt = true;
 }
@@ -803,8 +803,8 @@ int main(int argc, char ** argv) {
 
 // if current token is not EOG, we add it to current assistant message
 if (params.conversation) {
-const auto id = gpt_sampler_last(smpl);
-assistant_ss << llama_token_to_piece(ctx, id, false);
+const auto id = common_sampler_last(smpl);
+assistant_ss << common_token_to_piece(ctx, id, false);
 }
 
 if (n_past > 0 && is_interacting) {
@@ -862,9 +862,9 @@ int main(int argc, char ** argv) {
 ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
 : std::move(buffer);
 // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
-const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
-const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+const auto line_pfx = common_tokenize(ctx, params.input_prefix, false, true);
+const auto line_inp = common_tokenize(ctx, user_inp, false, format_chat);
+const auto line_sfx = common_tokenize(ctx, params.input_suffix, false, true);
 
 LOG_DBG("input tokens: %s\n", string_from(ctx, line_inp).c_str());
 
@@ -882,7 +882,7 @@ int main(int argc, char ** argv) {
 for (size_t i = original_size; i < embd_inp.size(); ++i) {
 const llama_token token = embd_inp[i];
 output_tokens.push_back(token);
-output_ss << llama_token_to_piece(ctx, token);
+output_ss << common_token_to_piece(ctx, token);
 }
 
 // reset assistant message
@@ -899,7 +899,7 @@ int main(int argc, char ** argv) {
 
 if (n_past > 0) {
 if (is_interacting) {
-gpt_sampler_reset(smpl);
+common_sampler_reset(smpl);
 }
 is_interacting = false;
 }
@@ -925,10 +925,10 @@ int main(int argc, char ** argv) {
 }
 
 LOG("\n\n");
-gpt_perf_print(ctx, smpl);
+common_perf_print(ctx, smpl);
 write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens);
 
-gpt_sampler_free(smpl);
+common_sampler_free(smpl);
 
 llama_free(ctx);
 llama_free_model(model);
 
@@ -54,7 +54,7 @@ static std::vector<std::string> k_prompts = {
 struct client {
 ~client() {
 if (smpl) {
-gpt_sampler_free(smpl);
+common_sampler_free(smpl);
 }
 }
 
@@ -75,7 +75,7 @@ struct client {
 std::string prompt;
 std::string response;
 
-struct gpt_sampler * smpl = nullptr;
+struct common_sampler * smpl = nullptr;
 };
 
 static void print_date_time() {
@@ -103,13 +103,13 @@ static std::vector<std::string> split_string(const std::string& input, char deli
 int main(int argc, char ** argv) {
 srand(1234);
 
-gpt_params params;
+common_params params;
 
-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) {
 return 1;
 }
 
-gpt_init();
+common_init();
 
 // number of simultaneous "clients" to simulate
 const int32_t n_clients = params.n_parallel;
@@ -130,7 +130,7 @@ int main(int argc, char ** argv) {
 llama_numa_init(params.numa);
 
 // load the target model
-llama_init_result llama_init = llama_init_from_gpt_params(params);
+common_init_result llama_init = common_init_from_params(params);
 
 llama_model * model = llama_init.model;
 llama_context * ctx = llama_init.context;
@@ -160,11 +160,11 @@ int main(int argc, char ** argv) {
 for (size_t i = 0; i < clients.size(); ++i) {
 auto & client = clients[i];
 client.id = i;
-client.smpl = gpt_sampler_init(model, params.sparams);
+client.smpl = common_sampler_init(model, params.sparams);
 }
 
 std::vector<llama_token> tokens_system;
-tokens_system = ::llama_tokenize(ctx, k_system, true);
+tokens_system = common_tokenize(ctx, k_system, true);
 const int32_t n_tokens_system = tokens_system.size();
 
 llama_seq_id g_seq_id = 0;
@@ -189,7 +189,7 @@ int main(int argc, char ** argv) {
 LOG_INF("%s: Evaluating the system prompt ...\n", __func__);
 
 for (int32_t i = 0; i < n_tokens_system; ++i) {
-llama_batch_add(batch, tokens_system[i], i, { 0 }, false);
+common_batch_add(batch, tokens_system[i], i, { 0 }, false);
 }
 
 if (llama_decode(ctx, batch) != 0) {
@@ -210,10 +210,10 @@ int main(int argc, char ** argv) {
 while (true) {
 if (dump_kv_cache) {
 llama_kv_cache_view_update(ctx, &kvc_view);
-llama_kv_cache_dump_view_seqs(kvc_view, 40);
+common_kv_cache_dump_view_seqs(kvc_view, 40);
 }
 
-llama_batch_clear(batch);
+common_batch_clear(batch);
 
 // decode any currently ongoing sequences
 for (auto & client : clients) {
@@ -223,7 +223,7 @@ int main(int argc, char ** argv) {
 
 client.i_batch = batch.n_tokens;
 
-llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
+common_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true);
 
 client.n_decoded += 1;
 }
@@ -252,14 +252,14 @@ int main(int argc, char ** argv) {
 client.prompt = client.input + "\nAssistant:";
 client.response = "";
 
-gpt_sampler_reset(client.smpl);
+common_sampler_reset(client.smpl);
 
 // do not prepend BOS because we have a system prompt!
 std::vector<llama_token> tokens_prompt;
-tokens_prompt = ::llama_tokenize(ctx, client.prompt, false);
+tokens_prompt = common_tokenize(ctx, client.prompt, false);
 
 for (size_t i = 0; i < tokens_prompt.size(); ++i) {
-llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
+common_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false);
 }
 
 // extract the logits only for the last token
@@ -340,9 +340,9 @@ int main(int argc, char ** argv) {
 //printf("client %d, seq %d, token %d, pos %d, batch %d\n",
 // client.id, client.seq_id, client.sampled, client.n_decoded, client.i_batch);
 
-const llama_token id = gpt_sampler_sample(client.smpl, ctx, client.i_batch - i);
+const llama_token id = common_sampler_sample(client.smpl, ctx, client.i_batch - i);
 
-gpt_sampler_accept(client.smpl, id, true);
+common_sampler_accept(client.smpl, id, true);
 
 if (client.n_decoded == 1) {
 // start measuring generation time after the first token to make sure all concurrent clients
@@ -350,7 +350,7 @@ int main(int argc, char ** argv) {
 client.t_start_gen = ggml_time_us();
 }
 
-const std::string token_str = llama_token_to_piece(ctx, id);
+const std::string token_str = common_token_to_piece(ctx, id);
 
 client.response += token_str;
 client.sampled = id;
@@ -15,17 +15,17 @@ static void print_usage(int, char ** argv) {
 }
 
 int main(int argc, char ** argv) {
-gpt_params params;
+common_params params;
 
 params.n_junk = 250;
 params.n_keep = 32;
 params.i_pos = -1;
 
-if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
+if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PASSKEY, print_usage)) {
 return 1;
 }
 
-gpt_init();
+common_init();
 
 int n_junk = params.n_junk;
 int n_keep = params.n_keep;
@@ -61,7 +61,7 @@ int main(int argc, char ** argv) {
 
 // initialize the model
 
-llama_model_params model_params = llama_model_params_from_gpt_params(params);
+llama_model_params model_params = common_model_params_to_llama(params);
 
 llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
 
@@ -72,7 +72,7 @@ int main(int argc, char ** argv) {
 
 // initialize the context
 
-llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+llama_context_params ctx_params = common_context_params_to_llama(params);
 
 ctx_params.n_ctx = llama_n_ctx_train(model)*n_grp + n_keep;
 
@@ -92,10 +92,10 @@ int main(int argc, char ** argv) {
 
 // tokenize the prompt
 std::vector<llama_token> tokens_list;
-tokens_list = ::llama_tokenize(ctx, params.prompt, true);
+tokens_list = common_tokenize(ctx, params.prompt, true);
 
 // tokenize the prefix and use it as a sink
-const int n_tokens_prefix = ::llama_tokenize(ctx, prompt_prefix, true).size();
+const int n_tokens_prefix = common_tokenize(ctx, prompt_prefix, true).size();
 
 const int n_tokens_all = tokens_list.size();
 
@@ -137,10 +137,10 @@ int main(int argc, char ** argv) {
 n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 }
 
-llama_batch_clear(batch);
+common_batch_clear(batch);
 
 for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
 }
 
 if (i + n_batch >= n_tokens_all) {
@@ -171,10 +171,10 @@ int main(int argc, char ** argv) {
 
 n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
 
-llama_batch_clear(batch);
+common_batch_clear(batch);
 
 for (int j = 0; j < n_batch && i + j < n_tokens_all; j++) {
-llama_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
+common_batch_add(batch, tokens_list[i + j], n_past++, { 0 }, false);
 }
 
 if (i + n_batch >= n_tokens_all) {
@@ -229,15 +229,15 @@ int main(int argc, char ** argv) {
 break;
 }
 
-LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+LOG("%s", common_token_to_piece(ctx, new_token_id).c_str());
 
 n_decode += 1;
 
 // prepare the next batch
-llama_batch_clear(batch);
+common_batch_clear(batch);
 
 // push this new token for next evaluation
-llama_batch_add(batch, new_token_id, n_past++, { 0 }, true);
+common_batch_add(batch, new_token_id, n_past++, { 0 }, true);
 }
 
 n_cur += 1;
@ -35,7 +35,7 @@ struct results_log_softmax {
|
||||||
};
|
};
|
||||||
|
|
||||||
static void write_logfile(
|
static void write_logfile(
|
||||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
const llama_context * ctx, const common_params & params, const llama_model * model,
|
||||||
const struct results_perplexity & results
|
const struct results_perplexity & results
|
||||||
) {
|
) {
|
||||||
if (params.logdir.empty()) {
|
if (params.logdir.empty()) {
|
||||||
|
@ -169,7 +169,7 @@ static void process_logits(
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
|
const results_log_softmax results = log_softmax(n_vocab, logits + size_t(i)*n_vocab, tokens[i+1]);
|
||||||
const double v = -results.log_softmax;
|
const double v = -results.log_softmax;
|
||||||
local_nll += v;
|
local_nll += v;
|
||||||
local_nll2 += v*v;
|
local_nll2 += v*v;
|
||||||
|
@ -203,7 +203,7 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits,
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
const double v = log_softmax(n_vocab, logits + i*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
|
const double v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, log_probs.data() + i*nv, tokens[i+1]);
|
||||||
local_nll += v;
|
local_nll += v;
|
||||||
local_nll2 += v*v;
|
local_nll2 += v*v;
|
||||||
}
|
}
|
||||||
|
@ -281,7 +281,9 @@ static std::pair<double, float> log_softmax(int n_vocab, const float * logits, c
|
||||||
kld.sum_kld += sum;
|
kld.sum_kld += sum;
|
||||||
kld.sum_kld2 += sum*sum;
|
kld.sum_kld2 += sum*sum;
|
||||||
++kld.count;
|
++kld.count;
|
||||||
if (imax == imax_base) ++kld.n_same_top;
|
if (imax == imax_base) {
|
||||||
|
++kld.n_same_top;
|
||||||
|
}
|
||||||
|
|
||||||
const float p_base = expf(-nll_base);
|
const float p_base = expf(-nll_base);
|
||||||
const float p = expf(-nll);
|
const float p = expf(-nll);
|
||||||
|
@ -323,7 +325,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
lock.unlock();
|
lock.unlock();
|
||||||
std::pair<double, float> v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
|
std::pair<double, float> v = log_softmax(n_vocab, logits + size_t(i)*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld);
|
||||||
kld_values[i] = (float)v.first;
|
kld_values[i] = (float)v.first;
|
||||||
p_diff_values[i] = v.second;
|
p_diff_values[i] = v.second;
|
||||||
}
|
}
|
||||||
|
@ -337,7 +339,7 @@ static void process_logits(int n_vocab, const float * logits, const int * tokens
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & params) {
|
static results_perplexity perplexity_v2(llama_context * ctx, const common_params & params) {
|
||||||
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
// Download: https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
// Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
|
||||||
// Output: `perplexity: 13.5106 [114/114]`
|
// Output: `perplexity: 13.5106 [114/114]`
|
||||||
|
@ -348,7 +350,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
|
|
||||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
@ -383,9 +385,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
|
const int n_chunk_max = (tokens.size() - calc_chunk + params.ppl_stride - 1) / params.ppl_stride;
|
||||||
|
|
||||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
||||||
const int n_batch = params.n_batch;
|
const int n_batch = params.n_batch;
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
|
|
||||||
|
@ -424,8 +427,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
|
||||||
}
|
}
|
||||||
|
|
||||||
const auto batch_logits = llama_get_logits(ctx);
|
const auto * batch_logits = llama_get_logits(ctx);
|
||||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
|
||||||
|
|
||||||
if (j == 0) {
|
if (j == 0) {
|
||||||
tokens[batch_start] = token_org;
|
tokens[batch_start] = token_org;
|
||||||
|
@ -447,11 +450,10 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
|
|
||||||
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
//LOG_DBG("%s: using tokens %d...%d\n",__func__,params.n_ctx - params.ppl_stride + start, params.n_ctx + start);
|
||||||
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
for (int j = n_ctx - params.ppl_stride - 1; j < n_ctx - 1; ++j) {
|
||||||
|
|
||||||
// Calculate probability of next token, given the previous ones.
|
// Calculate probability of next token, given the previous ones.
|
||||||
const std::vector<float> tok_logits(
|
const std::vector<float> tok_logits(
|
||||||
logits.begin() + (j + 0) * n_vocab,
|
logits.begin() + size_t(j + 0) * n_vocab,
|
||||||
logits.begin() + (j + 1) * n_vocab);
|
logits.begin() + size_t(j + 1) * n_vocab);
|
||||||
|
|
||||||
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
const float prob = softmax(tok_logits)[tokens[start + j + 1]];
|
||||||
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
|
logit_history[start + j + 1] = tok_logits[tokens[start + j + 1]];
|
||||||
|
@ -472,7 +474,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params &
|
||||||
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
return {tokens, std::exp(nll / count), logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
static results_perplexity perplexity(llama_context * ctx, const gpt_params & params, const int32_t n_ctx) {
|
static results_perplexity perplexity(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
|
||||||
if (params.ppl_stride > 0) {
|
if (params.ppl_stride > 0) {
|
||||||
return perplexity_v2(ctx, params);
|
return perplexity_v2(ctx, params);
|
||||||
}
|
}
|
||||||
|
@ -500,7 +502,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
auto tim1 = std::chrono::high_resolution_clock::now();
|
auto tim1 = std::chrono::high_resolution_clock::now();
|
||||||
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
LOG_INF("%s: tokenizing the input ..\n", __func__);
|
||||||
|
|
||||||
std::vector<llama_token> tokens = ::llama_tokenize(ctx, params.prompt, true);
|
std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);
|
||||||
|
|
||||||
auto tim2 = std::chrono::high_resolution_clock::now();
|
auto tim2 = std::chrono::high_resolution_clock::now();
|
||||||
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
|
||||||
|
@ -521,9 +523,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
const int n_chunk_max = tokens.size() / n_ctx;
|
const int n_chunk_max = tokens.size() / n_ctx;
|
||||||
|
|
||||||
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
|
||||||
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
||||||
const int n_batch = params.n_batch;
|
const int n_batch = params.n_batch;
|
||||||
|
|
||||||
|
const int n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
int count = 0;
|
int count = 0;
|
||||||
double nll = 0.0;
|
double nll = 0.0;
|
||||||
double nll2 = 0.0;
|
double nll2 = 0.0;
|
||||||
|
@ -538,7 +541,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
|
||||||
|
|
||||||
std::vector<float> logits;
|
std::vector<float> logits;
|
||||||
if (num_batches > 1) {
|
if (num_batches > 1) {
|
||||||
logits.reserve((size_t)n_ctx * n_vocab);
|
logits.reserve(size_t(n_ctx) * n_vocab);
|
||||||
}
|
}
|
||||||
|
|
||||||
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
LOG_INF("%s: calculating perplexity over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
|
||||||
|
@@ -620,7 +623,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par

  if (num_batches > 1 && n_outputs > 0) {
  const auto * batch_logits = llama_get_logits(ctx);
- logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab);
+ logits.insert(logits.end(), batch_logits, batch_logits + size_t(n_outputs) * n_vocab);
  }
  }

@@ -661,7 +664,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  } else {
  double av = nll/count;
  double av2 = nll2/count - av*av;
- if (av2 > 0) av2 = sqrt(av2/(count-1));
+ if (av2 > 0) {
+ av2 = sqrt(av2/(count-1));
+ }
  LOG("%8d %.4lf %4lf %4lf\n", i*n_ctx, std::exp(nll / count), av, av2);
  }
  }

@@ -686,10 +691,10 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par
  return {tokens, ppl, logit_history, prob_history};
  }

- static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int32_t n_batch, int32_t n_vocab) {
+ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<float> & batch_logits, int n_batch, int n_vocab) {
  int prev_outputs = 0;
- for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) {
- const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i));
+ for (int i = 0; i < (int) batch.n_tokens; i += n_batch) {
+ const int n_tokens = std::min<int>(n_batch, batch.n_tokens - i);

  llama_batch batch_view = {
  n_tokens,

@@ -713,7 +718,7 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
  n_outputs += batch_view.logits[i] != 0;
  }

- memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float));
+ memcpy(batch_logits.data() + size_t(prev_outputs)*n_vocab, llama_get_logits(ctx), size_t(n_outputs)*n_vocab*sizeof(float));

  prev_outputs += n_outputs;
  }

@@ -728,7 +733,9 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  if (eval_results.size() != eval_pairs.size()) {
  eval_results.resize(eval_pairs.size());
  }
- if (eval_pairs.empty()) return;
+ if (eval_pairs.empty()) {
+ return;
+ }

  size_t max_threads = std::min((eval_pairs.size() + K_TOKEN_CHUNK - 1)/K_TOKEN_CHUNK, workers.size());

@@ -736,11 +743,13 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  auto compute = [&counter, &eval_pairs, &eval_results, batch_logits, n_vocab] () {
  float local_logprobs[K_TOKEN_CHUNK];
  while (true) {
- size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
- if (first >= eval_results.size()) break;
- size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
+ const size_t first = counter.fetch_add(K_TOKEN_CHUNK, std::memory_order_relaxed);
+ if (first >= eval_results.size()) {
+ break;
+ }
+ const size_t last = std::min(first + K_TOKEN_CHUNK, eval_results.size());
  for (size_t i = first; i < last; ++i) {
- auto logits = batch_logits + eval_pairs[i].first * n_vocab;
+ const auto * logits = batch_logits + eval_pairs[i].first * n_vocab;
  float max_logit = logits[0];
  for (int j = 1; j < n_vocab; ++j) {
  max_logit = std::max(max_logit, logits[j]);

@@ -763,7 +772,7 @@ static void compute_logprobs(const float * batch_logits, int n_vocab, std::vecto
  }
  }

- static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
+ static void hellaswag_score(llama_context * ctx, const common_params & params) {
  // Calculates hellaswag score (acc_norm) from prompt
  //
  // Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
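The `compute_logprobs` hunks above only tighten the existing worker pattern (const locals, braced early exits); the underlying scheme is unchanged: threads pull `K_TOKEN_CHUNK`-sized slices of the work range from a shared atomic counter until the range is exhausted. A stripped-down, self-contained sketch of that pattern with illustrative names:

```cpp
#include <algorithm>
#include <atomic>
#include <cstddef>
#include <thread>
#include <vector>

int main() {
    constexpr size_t K_CHUNK = 4;     // stands in for K_TOKEN_CHUNK
    std::vector<int> results(103, 0); // stands in for eval_results

    std::atomic<size_t> counter(0);

    auto compute = [&]() {
        while (true) {
            // each worker claims the next chunk of indices
            const size_t first = counter.fetch_add(K_CHUNK, std::memory_order_relaxed);
            if (first >= results.size()) {
                break;
            }
            const size_t last = std::min(first + K_CHUNK, results.size());
            for (size_t i = first; i < last; ++i) {
                results[i] = int(i); // the real code computes a log-probability here
            }
        }
    };

    std::vector<std::thread> workers;
    for (int t = 0; t < 4; ++t) {
        workers.emplace_back(compute);
    }
    for (auto & w : workers) {
        w.join();
    }
    return 0;
}
```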
@@ -844,7 +853,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] );
  for (size_t j = 0; j < 4; j++) {
  hs_cur.ending[j] = prompt_lines[idx*6+2+j];
- hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
+ hs_cur.seq_tokens[j] = common_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true);
  }

  // determine the common prefix of the endings

@@ -877,10 +886,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

  double acc = 0.0f;

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -888,7 +898,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {

  std::vector<float> tok_logits(n_vocab);
  // TODO: this could be made smaller; it's currently the worst-case size
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;

@@ -900,7 +910,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  size_t i1 = i0;
  size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // batch as much tasks as possible into the available context
  // each task has 4 unique sequence ids - one for each ending

@@ -916,7 +926,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  }

  for (size_t i = 0; i < hs_cur.common_prefix; ++i) {
- llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
+ common_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false);
  }
  batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
  n_logits += 1;

@@ -926,7 +936,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  // TODO: don't evaluate the last token of each sequence
  for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) {
  const bool needs_logits = i < seq_tokens_size - 1;
- llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+ common_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits);
  n_logits += needs_logits;
  }
  }

@@ -975,7 +985,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
  auto & hs_cur = hs_data[i];

  // get the logits of the last token of the common prefix
- std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float));
+ std::memcpy(tok_logits.data(), batch_logits.data() + hs_cur.i_logits*n_vocab, n_vocab*sizeof(float));

  const auto first_probs = softmax(tok_logits);

@@ -1102,7 +1112,7 @@ static std::vector<winogrande_entry> load_winogrande_from_csv(const std::string
  * 0,Sarah was a much better surgeon than Maria so _ always got the easier cases.,Sarah,Maria,2
  *
  */
- static void winogrande_score(llama_context * ctx, const gpt_params & params) {
+ static void winogrande_score(llama_context * ctx, const common_params & params) {

  constexpr int k_min_trailing_ctx = 3;

@@ -1136,8 +1146,8 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  LOG_INF("%s : tokenizing selected tasks\n", __func__);

  for (auto & task : data) {
- task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true);
- task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true);
+ task.seq_tokens[0] = common_tokenize(ctx, task.first + task.choices[0] + task.second, true);
+ task.seq_tokens[1] = common_tokenize(ctx, task.first + task.choices[1] + task.second, true);

  task.common_prefix = 0;
  for (size_t k = 0; k < task.seq_tokens[0].size(); k++) {

@@ -1152,16 +1162,17 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  task.seq_tokens[0].size() - task.common_prefix +
  task.seq_tokens[1].size() - task.common_prefix;

- task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size();
- task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size();
+ task.n_base1 = common_tokenize(ctx, task.first + task.choices[0], true).size();
+ task.n_base2 = common_tokenize(ctx, task.first + task.choices[1], true).size();
  }

  LOG_INF("%s : calculating winogrande score over selected tasks.\n", __func__);

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  const int max_tasks_per_batch = 128;
  const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

@@ -1169,7 +1180,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {

  std::vector<float> tok_logits(n_vocab);
  // TODO: this could be made smaller; it's currently the worst-case size
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;

@@ -1184,7 +1195,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  size_t i1 = i0;
  size_t i_logits = 0;

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  while (n_cur + (int) data[i1].required_tokens <= n_ctx) {
  int n_logits = 0;

@@ -1194,7 +1205,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  }

  for (size_t i = 0; i < data[i1].common_prefix; ++i) {
- llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
+ common_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false);
  }
  batch.logits[batch.n_tokens - 1] = true;
  n_logits += 1;

@@ -1202,7 +1213,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
  for (int s = 0; s < 2; ++s) {
  // TODO: end before the last token, no need to predict past the end of the sequences
  for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) {
- llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
+ common_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true);
  n_logits += 1;
  }
  }

@@ -1359,7 +1370,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  }
  return false;
  }
- task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true));
+ task.seq_tokens.emplace_back(::common_tokenize(ctx, task.question + " " + answer, true));
  }
  auto min_len = task.seq_tokens.front().size();
  for (auto& seq : task.seq_tokens) {

@@ -1403,7 +1414,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choic
  // git@hf.co:datasets/Stevross/mmlu
  // https://huggingface.co/datasets/truthful_qa
  //
- static void multiple_choice_score(llama_context * ctx, const gpt_params & params) {
+ static void multiple_choice_score(llama_context * ctx, const common_params & params) {

  std::istringstream strstream(params.prompt);
  uint32_t n_task;

@@ -1509,17 +1520,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

  LOG("\ntask\tacc_norm\n");

- const int n_vocab = llama_n_vocab(llama_get_model(ctx));
  const int n_ctx = llama_n_ctx(ctx);
  const int n_batch = params.n_batch;

+ const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  const int max_tasks_per_batch = 32;
  const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));

  llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

  std::vector<float> tok_logits(n_vocab);
- std::vector<float> batch_logits(n_vocab*n_ctx);
+ std::vector<float> batch_logits(size_t(n_ctx)*n_vocab);

  std::vector<std::pair<size_t, llama_token>> eval_pairs;
  std::vector<float> eval_results;

@@ -1536,7 +1548,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  size_t i1 = i0;
  size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch

- llama_batch_clear(batch);
+ common_batch_clear(batch);

  // batch as much tasks as possible into the available context
  // each task has 4 unique sequence ids - one for each ending

@@ -1559,7 +1571,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params

  for (size_t i = 0; i < cur_task.common_prefix; ++i) {
  //llama_batch_add(batch, cur_task.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false);
- llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
+ common_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false);
  }
  batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix
  n_logits += 1;

@@ -1569,7 +1581,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  // TODO: don't evaluate the last token of each sequence
  for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) {
  const bool needs_logits = i < seq_tokens_size - 1;
- llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
+ common_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits);
  n_logits += needs_logits;
  }
  }

@@ -1627,7 +1639,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  //LOG("\n common_prefix: %zu\n", cur_task.common_prefix);

  // get the logits of the last token of the common prefix
- std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float));
+ std::memcpy(tok_logits.data(), batch_logits.data() + cur_task.i_logits*n_vocab, n_vocab*sizeof(float));

  const auto first_probs = softmax(tok_logits);

@@ -1683,7 +1695,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
  LOG_INF("\n");
  }

- static void kl_divergence(llama_context * ctx, const gpt_params & params) {
+ static void kl_divergence(llama_context * ctx, const common_params & params) {
  if (params.logits_file.empty()) {
  LOG_ERR("%s: you must provide a name of a file containing the log probabilities of the base model\n", __func__);
  return;

@@ -1709,7 +1721,8 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  __func__, params.logits_file.c_str(), n_ctx, params.n_ctx);
  }

- int n_vocab, n_chunk;
+ int n_vocab;
+ int n_chunk;
  in.read((char *)&n_vocab, sizeof(n_vocab));
  in.read((char *)&n_chunk, sizeof(n_chunk));
  if (in.fail()) {

@@ -1720,7 +1733,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  LOG_ERR("%s: inconsistent vocabulary (%d vs %d)\n", __func__, n_vocab, llama_n_vocab(llama_get_model(ctx)));
  }

- std::vector<llama_token> tokens(n_ctx * n_chunk);
+ std::vector<llama_token> tokens(size_t(n_ctx) * n_chunk);
  if (in.read((char *)tokens.data(), tokens.size()*sizeof(tokens[0])).fail()) {
  LOG_ERR("%s: failed reading evaluation tokens from %s\n", __func__, params.logits_file.c_str());
  return;

@@ -1737,7 +1750,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  std::vector<float> p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk);
  std::vector<float> logits;
  if (num_batches > 1) {
- logits.reserve(n_ctx * n_vocab);
+ logits.reserve(size_t(n_ctx) * n_vocab);
  }

  std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

@@ -1801,7 +1814,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

  if (num_batches > 1) {
  const auto * batch_logits = llama_get_logits(ctx);
- logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
+ logits.insert(logits.end(), batch_logits, batch_logits + size_t(batch_size) * n_vocab);
  }
  }

@@ -1822,7 +1835,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {

  const int first = n_ctx/2;
  const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
- process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
+ process_logits(n_vocab, all_logits + size_t(first)*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
  workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr);
  p_diff_ptr += n_ctx - 1 - first;
  kld_ptr += n_ctx - 1 - first;

@@ -1955,17 +1968,17 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) {
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.n_ctx = 512;
  params.logits_all = true;
  params.escape = false;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
  return 1;
  }

- gpt_init();
+ common_init();

  const int32_t n_ctx = params.n_ctx;

@@ -2004,7 +2017,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model and apply lora adapter, if any
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;

@@ -2023,7 +2036,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }

  struct results_perplexity results;
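The hunks above are dominated by the `gpt_*` to `common_*` rename of the shared helper library (`gpt_params` to `common_params`, `gpt_params_parse` to `common_params_parse`, `llama_tokenize`/`llama_batch_add`/`llama_batch_clear` to `common_tokenize`/`common_batch_add`/`common_batch_clear`, and so on). A condensed sketch of the renamed call sequence as it appears in the updated `main()`; it is not a standalone program, since it assumes llama.cpp's `common` library headers:

```cpp
#include "common.h"
#include "llama.h"

#include <vector>

int run(int argc, char ** argv) {
    common_params params;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PERPLEXITY)) {
        return 1;
    }

    common_init();

    // model and context now come from the renamed init helper
    common_init_result llama_init = common_init_from_params(params);

    llama_model   * model = llama_init.model;
    llama_context * ctx   = llama_init.context;

    // tokenization also moved to the common_ prefix
    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true);

    (void) model;
    (void) tokens;
    return 0;
}
```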
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
  }

  static void test_roundtrip_on_chunk(
- const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
+ const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
  float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
  ) {
  if (layer->type == GGML_TYPE_F16) {

@@ -166,7 +166,7 @@ static void test_roundtrip_on_chunk(

  // Run quantization function for a single layer and update error stats
  static void test_roundtrip_on_layer(
- std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
+ std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
  const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
  std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
  ) {

@@ -371,8 +371,8 @@ int main(int argc, char ** argv) {
  if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
  continue;
  }
- ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
- if (qfns.from_float && qfns.to_float) {
+ const auto * qfns = ggml_get_type_traits(type);
+ if (qfns->from_float && qfns->to_float) {
  if (params.verbose) {
  printf("testing %s ...\n", ggml_type_name(type));
  }

@@ -393,7 +393,7 @@ int main(int argc, char ** argv) {
  test_roundtrip_on_layer(
  layer_name,
  params.per_layer_stats,
- qfns,
+ *qfns,
  params.reference,
  kv_tensor.second,
  input_scratch,
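The hunks above track an API change in ggml: the old `ggml_internal_get_type_traits()` returned a `ggml_type_traits_t` by value, while `ggml_get_type_traits()` returns a `const ggml_type_traits *`, so callers now dereference the pointer (and pass `*qfns` where a reference is expected). A small sketch of the updated lookup, based only on the calls visible in this hunk:

```cpp
#include "ggml.h"

// true when the type can be quantized and dequantized, i.e. when the
// round-trip test above can run for it
static bool type_supports_roundtrip(ggml_type type) {
    const auto * qfns = ggml_get_type_traits(type);
    return qfns->from_float && qfns->to_float;
}
```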
@@ -77,7 +77,7 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
  static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
  size_t n_tokens = tokens.size();
  for (size_t i = 0; i < n_tokens; i++) {
- llama_batch_add(batch, tokens[i], i, { seq_id }, true);
+ common_batch_add(batch, tokens[i], i, { seq_id }, true);
  }
  }

@@ -107,18 +107,18 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
  }

  float * out = output + batch.seq_id[i][0] * n_embd;
- llama_embd_normalize(embd, out, n_embd);
+ common_embd_normalize(embd, out, n_embd);
  }
  }

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_RETRIEVAL, print_usage)) {
  return 1;
  }

- gpt_init();
+ common_init();

  // For BERT models, batch size must be equal to ubatch size
  params.n_ubatch = params.n_batch;

@@ -149,7 +149,7 @@ int main(int argc, char ** argv) {
  llama_numa_init(params.numa);

  // load the model
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;

@@ -176,7 +176,7 @@ int main(int argc, char ** argv) {
  // print system information
  {
  LOG_INF("\n");
- LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+ LOG_INF("%s\n", common_params_get_system_info(params).c_str());
  }

  // max batch size

@@ -185,7 +185,7 @@ int main(int argc, char ** argv) {

  // tokenize the prompts and trim
  for (auto & chunk : chunks) {
- auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false);
+ auto inp = common_tokenize(ctx, chunk.textdata, true, false);
  if (inp.size() > n_batch) {
  LOG_ERR("%s: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
  __func__, (long long int) inp.size(), (long long int) n_batch);

@@ -204,7 +204,7 @@ int main(int argc, char ** argv) {
  LOG_INF("%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str());
  LOG_INF("%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size());
  for (int j = 0; j < (int) chunks[i].tokens.size(); j++) {
- LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
+ LOG_INF("%6d -> '%s'\n", chunks[i].tokens[j], common_token_to_piece(ctx, chunks[i].tokens[j]).c_str());
  }
  LOG_INF("\n\n");
  }

@@ -232,7 +232,7 @@ int main(int argc, char ** argv) {
  if (batch.n_tokens + n_toks > n_batch) {
  float * out = emb + p * n_embd;
  batch_decode(ctx, batch, out, s, n_embd);
- llama_batch_clear(batch);
+ common_batch_clear(batch);
  p += s;
  s = 0;
  }

@@ -260,20 +260,20 @@ int main(int argc, char ** argv) {
  while (true) {
  LOG("Enter query: ");
  std::getline(std::cin, query);
- std::vector<int32_t> query_tokens = llama_tokenize(ctx, query, true);
+ std::vector<int32_t> query_tokens = common_tokenize(ctx, query, true);

  batch_add_seq(query_batch, query_tokens, 0);

  std::vector<float> query_emb(n_embd, 0);
  batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd);

- llama_batch_clear(query_batch);
+ common_batch_clear(query_batch);

  // compute cosine similarities
  {
  std::vector<std::pair<int, float>> similarities;
  for (int i = 0; i < n_chunks; i++) {
- float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
+ float sim = common_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd);
  similarities.push_back(std::make_pair(i, sim));
  }
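The retrieval hunks above are again the `common_*` rename; `common_embd_similarity_cos` is the renamed cosine-similarity helper used to rank chunks against the query embedding. For illustration only, a generic reimplementation of that score (not the library's own code, which may handle edge cases differently):

```cpp
#include <cmath>

// cosine similarity between two embedding vectors of length n
static float cosine_similarity(const float * a, const float * b, int n) {
    double dot = 0.0, norm_a = 0.0, norm_b = 0.0;
    for (int i = 0; i < n; ++i) {
        dot    += double(a[i]) * b[i];
        norm_a += double(a[i]) * a[i];
        norm_b += double(b[i]) * b[i];
    }
    if (norm_a == 0.0 || norm_b == 0.0) {
        return 0.0f; // degenerate input; the real helper may treat this differently
    }
    return float(dot / (std::sqrt(norm_a) * std::sqrt(norm_b)));
}
```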
@@ -6,6 +6,10 @@
  #include "ggml-metal.h"
  #endif

+ #ifdef GGML_USE_VULKAN
+ #include "ggml-vulkan.h"
+ #endif

  #include "ggml-rpc.h"
  #ifdef _WIN32
  # include <windows.h>

@@ -79,6 +83,12 @@ static ggml_backend_t create_backend() {
  if (!backend) {
  fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
  }
+ #elif GGML_USE_VULKAN
+ fprintf(stderr, "%s: using Vulkan backend\n", __func__);
+ backend = ggml_backend_vk_init(0); // init device 0
+ if (!backend) {
+ fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__);
+ }
  #endif

  // if there aren't GPU Backends fallback to CPU backend

@@ -92,6 +102,8 @@ static ggml_backend_t create_backend() {
  static void get_backend_memory(size_t * free_mem, size_t * total_mem) {
  #ifdef GGML_USE_CUDA
  ggml_backend_cuda_get_device_memory(0, free_mem, total_mem);
+ #elif GGML_USE_VULKAN
+ ggml_backend_vk_get_device_memory(0, free_mem, total_mem);
  #else
  #ifdef _WIN32
  MEMORYSTATUSEX status;

@@ -139,7 +151,7 @@ int main(int argc, char * argv[]) {
  get_backend_memory(&free_mem, &total_mem);
  }
  printf("Starting RPC server on %s, backend memory: %zu MB\n", endpoint.c_str(), free_mem / (1024 * 1024));
- start_rpc_server(backend, endpoint.c_str(), free_mem, total_mem);
+ ggml_backend_rpc_start_server(backend, endpoint.c_str(), free_mem, total_mem);
  ggml_backend_free(backend);
  return 0;
  }
@@ -6,12 +6,12 @@
  #include <cstdio>

  int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

  params.prompt = "The quick brown fox";
  params.sparams.seed = 1234;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
  return 1;
  }

@@ -28,7 +28,7 @@ int main(int argc, char ** argv) {
  std::string result2;

  // init
- llama_init_result llama_init = llama_init_from_gpt_params(params);
+ common_init_result llama_init = common_init_from_params(params);

  llama_model * model = llama_init.model;
  llama_context * ctx = llama_init.context;

@@ -46,7 +46,7 @@ int main(int argc, char ** argv) {
  llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));

  // tokenize prompt
- auto tokens = llama_tokenize(ctx, params.prompt, true);
+ auto tokens = common_tokenize(ctx, params.prompt, true);

  // evaluate prompt
  llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), n_past, 0));

@@ -72,7 +72,7 @@ int main(int argc, char ** argv) {

  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl, ctx, -1);
- auto next_token_str = llama_token_to_piece(ctx, next_token);
+ auto next_token_str = common_token_to_piece(ctx, next_token);

  printf("%s", next_token_str.c_str());
  result0 += next_token_str;

@@ -92,7 +92,7 @@ int main(int argc, char ** argv) {
  llama_free(ctx);

  // make new context
- auto * ctx2 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+ auto * ctx2 = llama_new_context_with_model(model, common_context_params_to_llama(params));

  llama_sampler * smpl2 = llama_sampler_chain_init(sparams);

@@ -128,7 +128,7 @@ int main(int argc, char ** argv) {
  // second run
  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
- auto next_token_str = llama_token_to_piece(ctx2, next_token);
+ auto next_token_str = common_token_to_piece(ctx2, next_token);

  printf("%s", next_token_str.c_str());
  result1 += next_token_str;

@@ -152,7 +152,7 @@ int main(int argc, char ** argv) {
  }

  // make new context
- auto * ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params));
+ auto * ctx3 = llama_new_context_with_model(model, common_context_params_to_llama(params));

  llama_sampler * smpl3 = llama_sampler_chain_init(sparams);

@@ -216,7 +216,7 @@ int main(int argc, char ** argv) {
  // third run with seq 1 instead of 0
  for (auto i = 0; i < params.n_predict; i++) {
  auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
- auto next_token_str = llama_token_to_piece(ctx3, next_token);
+ auto next_token_str = common_token_to_piece(ctx3, next_token);

  printf("%s", next_token_str.c_str());
  result2 += next_token_str;
@@ -18,6 +18,8 @@ The project is under active development, and we are [looking for feedback and co

  ## Usage

+ <!-- Note for contributors: The list below is generated by llama-gen-docs -->

  **Common params**

  | Argument | Explanation |

@@ -58,8 +60,6 @@ The project is under active development, and we are [looking for feedback and co
  | `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: 1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
  | `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: 1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
  | `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: 32.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
- | `-gan, --grp-attn-n N` | group-attention factor (default: 1)<br/>(env: LLAMA_ARG_GRP_ATTN_N) |
- | `-gaw, --grp-attn-w N` | group-attention width (default: 512.0)<br/>(env: LLAMA_ARG_GRP_ATTN_W) |
  | `-dkvc, --dump-kv-cache` | verbose print of the KV cache |
  | `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
  | `-ctk, --cache-type-k TYPE` | KV cache data type for K (default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |

@@ -100,7 +100,7 @@ The project is under active development, and we are [looking for feedback and co
  | Argument | Explanation |
  | -------- | ----------- |
  | `--samplers SAMPLERS` | samplers that will be used for generation in the order, separated by ';'<br/>(default: top_k;tfs_z;typ_p;top_p;min_p;temperature) |
- | `-s, --seed SEED` | RNG seed (default: 4294967295, use random seed for 4294967295) |
+ | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) |
  | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: kfypmt) |
  | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) |
  | `--penalize-nl` | penalize newline tokens (default: false) |

@@ -147,9 +147,10 @@ The project is under active development, and we are [looking for feedback and co
  | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
  | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
  | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
- | `-spf, --system-prompt-file FNAME` | set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications |
  | `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
- | `--no-slots` | disables slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+ | `--slots` | enable slots monitoring endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
+ | `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
+ | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
  | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
  | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted:<br/>https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
  | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |

@@ -316,7 +317,6 @@ node index.js

  - The prompt is a string or an array with the first element given as a string
  - The model's `tokenizer.ggml.add_bos_token` metadata is `true`
- - The system prompt is empty

  `temperature`: Adjust the randomness of the generated text. Default: `0.8`

@@ -374,14 +374,14 @@ node index.js

  `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`

+ `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled.
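A hedged client-side sketch of using the new `t_max_predict_ms` option together with `cache_prompt` on `/completion`. The host, port and libcurl transport are assumptions (any HTTP client works); only the JSON field names come from the documentation above:

```cpp
#include <curl/curl.h>
#include <string>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // stop generating roughly 500 ms after the first predicted token
    const std::string body = R"({
        "prompt": "def fib(n):",
        "t_max_predict_ms": 500,
        "cache_prompt": true
    })";

    struct curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/completion"); // assumed host/port
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());

    const CURLcode res = curl_easy_perform(curl); // response body goes to stdout by default

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}
```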
  `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "<BASE64_STRING>", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA.

  `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1`

  `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `false`

- `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)

  `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values.

  **Response format**

@@ -521,32 +521,37 @@ Takes a prefix and a suffix and returns the predicted completion as stream.

  *Options:*

- `input_prefix`: Set the prefix of the code to infill.
+ - `input_prefix`: Set the prefix of the code to infill.
+ - `input_suffix`: Set the suffix of the code to infill.

- `input_suffix`: Set the suffix of the code to infill.
+ It also accepts all the options of `/completion`.

- It also accepts all the options of `/completion` except `stream` and `prompt`.
+ ### **GET** `/props`: Get server global properties.

- - **GET** `/props`: Return current server settings.
+ This endpoint is public (no API key check). By default, it is read-only. To make POST request to change global properties, you need to start server with `--props`

  **Response format**

  ```json
  {
- "assistant_name": "",
- "user_name": "",
  "default_generation_settings": { ... },
  "total_slots": 1,
  "chat_template": ""
  }
  ```

- - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots.
- - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots.
  - `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint.
  - `total_slots` - the total number of slots for process requests (defined by `--parallel` option)
  - `chat_template` - the model's original Jinja2 prompt template

+ ### POST `/props`: Change server global properties.
+
+ To use this endpoint with POST method, you need to start server with `--props`
+
+ *Options:*
+
+ - None yet
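A minimal sketch of exercising the split `/props` behaviour described above: the GET is always available, while the POST is only accepted when the server was started with `--props`. libcurl and the default host/port are assumptions of this example:

```cpp
#include <curl/curl.h>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // GET /props: public, read-only by default
    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/props"); // assumed host/port
    curl_easy_perform(curl);

    // POST /props: rejected unless the server was started with --props
    struct curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, "{}"); // no documented options yet
    curl_easy_perform(curl);

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return 0;
}
```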
  ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API

  Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
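A hedged client sketch for the endpoint described above; the served model is selected by the running server, and the host/port plus the libcurl transport are assumptions of this example:

```cpp
#include <curl/curl.h>
#include <string>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // ChatML-style messages array, as described above
    const std::string body = R"({
        "messages": [
            { "role": "system", "content": "You are a helpful assistant." },
            { "role": "user",   "content": "Hello!" }
        ]
    })";

    struct curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");

    curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/v1/chat/completions"); // assumed host/port
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());
    curl_easy_perform(curl); // response is printed to stdout by default

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return 0;
}
```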
@@ -813,28 +818,6 @@ To know the `id` of the adapter, use GET `/lora-adapters`

  ## More examples

- ### Change system prompt on runtime
-
- To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once.
-
- `prompt`: Specify a context that you want all connecting clients to respect.
-
- `anti_prompt`: Specify the word you want to use to instruct the model to stop. This must be sent to each client through the `/props` endpoint.
-
- `assistant_name`: The bot's name is necessary for each customer to generate the prompt. This must be sent to each client through the `/props` endpoint.
-
- ```json
- {
- "system_prompt": {
- "prompt": "Transcript of a never ending dialog, where the User interacts with an Assistant.\nThe Assistant is helpful, kind, honest, good at writing, and never fails to answer the User's requests immediately and with precision.\nUser: Recommend a nice restaurant in the area.\nAssistant: I recommend the restaurant \"The Golden Duck\". It is a 5 star restaurant with a great view of the city. The food is delicious and the service is excellent. The prices are reasonable and the portions are generous. The restaurant is located at 123 Main Street, New York, NY 10001. The phone number is (212) 555-1234. The hours are Monday through Friday from 11:00 am to 10:00 pm. The restaurant is closed on Saturdays and Sundays.\nUser: Who is Richard Feynman?\nAssistant: Richard Feynman was an American physicist who is best known for his work in quantum mechanics and particle physics. He was awarded the Nobel Prize in Physics in 1965 for his contributions to the development of quantum electrodynamics. He was a popular lecturer and author, and he wrote several books, including \"Surely You're Joking, Mr. Feynman!\" and \"What Do You Care What Other People Think?\".\nUser:",
- "anti_prompt": "User:",
- "assistant_name": "Assistant:"
- }
- }
- ```
-
- **NOTE**: You can do this automatically when starting the server by simply creating a .json file with these options and using the CLI option `-spf FNAME` or `--system-prompt-file FNAME`.

  ### Interactive mode

  Check the sample in [chat.mjs](chat.mjs).
@ -132,6 +132,9 @@ struct slot_params {
|
||||||
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
|
||||||
int32_t n_predict = -1; // new tokens to predict
|
int32_t n_predict = -1; // new tokens to predict
|
||||||
|
|
||||||
|
int64_t t_max_prompt_ms = -1; // TODO: implement
|
||||||
|
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
|
||||||
|
|
||||||
std::vector<std::string> antiprompt;
|
std::vector<std::string> antiprompt;
|
||||||
|
|
||||||
json input_prefix;
|
json input_prefix;
|
||||||
|
@ -175,6 +178,7 @@ struct server_slot {
|
||||||
server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
server_task_cmpl_type cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
||||||
|
|
||||||
bool has_next_token = true;
|
bool has_next_token = true;
|
||||||
|
bool has_new_line = false;
|
||||||
bool truncated = false;
|
bool truncated = false;
|
||||||
bool stopped_eos = false;
|
bool stopped_eos = false;
|
||||||
bool stopped_word = false;
|
bool stopped_word = false;
|
||||||
|
@ -188,17 +192,11 @@ struct server_slot {
|
||||||
// sampling
|
// sampling
|
||||||
json json_schema;
|
json json_schema;
|
||||||
|
|
||||||
struct gpt_sampler_params sparams;
|
struct common_sampler_params sparams;
|
||||||
struct gpt_sampler * smpl = nullptr;
|
struct common_sampler * smpl = nullptr;
|
||||||
|
|
||||||
llama_token sampled;
|
llama_token sampled;
|
||||||
|
|
||||||
int32_t ga_i = 0; // group-attention state
|
|
||||||
int32_t ga_n = 1; // group-attention factor
|
|
||||||
int32_t ga_w = 512; // group-attention width
|
|
||||||
|
|
||||||
int32_t n_past_se = 0; // self-extend
|
|
||||||
|
|
||||||
// stats
|
// stats
|
||||||
size_t n_sent_text = 0; // number of sent text character
|
size_t n_sent_text = 0; // number of sent text character
|
||||||
size_t n_sent_token_probs = 0;
|
size_t n_sent_token_probs = 0;
|
||||||
|
@ -216,6 +214,7 @@ struct server_slot {
|
||||||
|
|
||||||
n_prompt_tokens = 0;
|
n_prompt_tokens = 0;
|
||||||
generated_text = "";
|
generated_text = "";
|
||||||
|
has_new_line = false;
|
||||||
truncated = false;
|
truncated = false;
|
||||||
stopped_eos = false;
|
stopped_eos = false;
|
||||||
stopped_word = false;
|
stopped_word = false;
|
||||||
|
@ -225,13 +224,11 @@ struct server_slot {
|
||||||
n_sent_text = 0;
|
n_sent_text = 0;
|
||||||
n_sent_token_probs = 0;
|
n_sent_token_probs = 0;
|
||||||
cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
cmpl_type = SERVER_TASK_CMPL_TYPE_NORMAL;
|
||||||
ga_i = 0;
|
|
||||||
n_past_se = 0;
|
|
||||||
|
|
||||||
generated_token_probs.clear();
|
generated_token_probs.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool has_budget(gpt_params &global_params) {
|
bool has_budget(common_params &global_params) {
|
||||||
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
if (params.n_predict == -1 && global_params.n_predict == -1) {
|
||||||
return true; // limitless
|
return true; // limitless
|
||||||
}
|
}
|
||||||
|
@ -611,9 +608,9 @@ struct server_response {
|
||||||
struct server_context {
|
struct server_context {
|
||||||
llama_model * model = nullptr;
|
llama_model * model = nullptr;
|
||||||
llama_context * ctx = nullptr;
|
llama_context * ctx = nullptr;
|
||||||
std::vector<llama_lora_adapter_container> loras;
|
std::vector<common_lora_adapter_container> loras;
|
||||||
|
|
||||||
gpt_params params;
|
common_params params;
|
||||||
|
|
||||||
llama_batch batch = {};
|
llama_batch batch = {};
|
||||||
|
|
||||||
|
@ -623,12 +620,6 @@ struct server_context {
|
||||||
|
|
||||||
int32_t n_ctx; // total context for all clients / slots
|
int32_t n_ctx; // total context for all clients / slots
|
||||||
|
|
||||||
// system prompt
|
|
||||||
bool system_need_update = false;
|
|
||||||
|
|
||||||
std::string system_prompt;
|
|
||||||
std::vector<llama_token> system_tokens;
|
|
||||||
|
|
||||||
// slots / clients
|
// slots / clients
|
||||||
std::vector<server_slot> slots;
|
std::vector<server_slot> slots;
|
||||||
json default_generation_settings_for_props;
|
json default_generation_settings_for_props;
|
||||||
|
@ -655,20 +646,20 @@ struct server_context {
|
||||||
// Clear any sampling context
|
// Clear any sampling context
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
if (slot.smpl != nullptr) {
|
if (slot.smpl != nullptr) {
|
||||||
gpt_sampler_free(slot.smpl);
|
common_sampler_free(slot.smpl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_batch_free(batch);
|
llama_batch_free(batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_model(const gpt_params & params_) {
|
bool load_model(const common_params & params_) {
|
||||||
params = params_;
|
params = params_;
|
||||||
|
|
||||||
// dedicate one sequence to the system prompt
|
// reserve one extra sequence (seq_id == 0) for extra features
|
||||||
params.n_parallel += 1;
|
params.n_parallel += 1;
|
||||||
|
|
||||||
llama_init_result llama_init = llama_init_from_gpt_params(params);
|
common_init_result llama_init = common_init_from_params(params);
|
||||||
|
|
||||||
model = llama_init.model;
|
model = llama_init.model;
|
||||||
ctx = llama_init.context;
|
ctx = llama_init.context;
|
||||||
|
@ -711,22 +702,6 @@ struct server_context {
|
||||||
|
|
||||||
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
|
||||||
|
|
||||||
const int ga_n = params.grp_attn_n;
|
|
||||||
const int ga_w = params.grp_attn_w;
|
|
||||||
|
|
||||||
if (ga_n != 1) {
|
|
||||||
GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT
|
|
||||||
GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT
|
|
||||||
//GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
|
|
||||||
//GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
|
|
||||||
|
|
||||||
SLT_INF(slot, "slot self-extend: ga_n = %d, ga_w = %d\n", ga_n, ga_w);
|
|
||||||
}
|
|
||||||
|
|
||||||
slot.ga_i = 0;
|
|
||||||
slot.ga_n = ga_n;
|
|
||||||
slot.ga_w = ga_w;
|
|
||||||
|
|
||||||
slot.sparams = params.sparams;
|
slot.sparams = params.sparams;
|
||||||
|
|
||||||
slot.callback_on_release = [this](int) {
|
slot.callback_on_release = [this](int) {
|
||||||
|
@ -753,12 +728,7 @@ struct server_context {
|
||||||
metrics.init();
|
metrics.init();
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special) const {
|
std::vector<llama_token> tokenize(const json & json_prompt, bool add_special, bool parse_special) const {
|
||||||
// TODO: currently, we tokenize using special tokens by default
|
|
||||||
// this is not always correct (see https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216)
|
|
||||||
// but it's better compared to completely ignoring ChatML and other chat templates
|
|
||||||
const bool TMP_FORCE_SPECIAL = true;
|
|
||||||
|
|
||||||
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
// If `add_bos` is true, we only add BOS, when json_prompt is a string,
|
||||||
// or the first element of the json_prompt array is a string.
|
// or the first element of the json_prompt array is a string.
|
||||||
std::vector<llama_token> prompt_tokens;
|
std::vector<llama_token> prompt_tokens;
|
||||||
|
@ -771,10 +741,10 @@ struct server_context {
|
||||||
|
|
||||||
std::vector<llama_token> p;
|
std::vector<llama_token> p;
|
||||||
if (first) {
|
if (first) {
|
||||||
p = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
p = common_tokenize(ctx, s, add_special, parse_special);
|
||||||
first = false;
|
first = false;
|
||||||
} else {
|
} else {
|
||||||
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
|
p = common_tokenize(ctx, s, false, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
|
||||||
|
@ -788,7 +758,7 @@ struct server_context {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
auto s = json_prompt.template get<std::string>();
|
auto s = json_prompt.template get<std::string>();
|
||||||
prompt_tokens = ::llama_tokenize(ctx, s, add_special, TMP_FORCE_SPECIAL);
|
prompt_tokens = common_tokenize(ctx, s, add_special, parse_special);
|
||||||
}
|
}
|
||||||
|
|
||||||
return prompt_tokens;
|
return prompt_tokens;
|
||||||
|
@ -909,6 +879,8 @@ struct server_context {
|
||||||
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
slot.sparams.seed = json_value(data, "seed", default_sparams.seed);
|
||||||
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
slot.sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
|
||||||
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||||
|
//slot.params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", default_params.t_max_prompt_ms); // TODO: implement
|
||||||
|
slot.params.t_max_predict_ms = json_value(data, "t_max_predict_ms", default_params.t_max_predict_ms);
|
||||||
|
|
||||||
// process "json_schema" and "grammar"
|
// process "json_schema" and "grammar"
|
||||||
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
||||||
|
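A minimal sketch of a `/completion` request exercising the `t_max_predict_ms` field wired up above. The host, port, and Python `requests` usage are assumptions for illustration; per the handler changes below, generation stops once a newline has been produced and the elapsed generation time exceeds this limit.

```python
# Illustrative request using the new t_max_predict_ms request field.
# Assumes a llama-server on localhost:8080 and the `requests` package.
import requests

payload = {
    "prompt": "Write a haiku about autumn:\n",
    "n_predict": 512,
    "t_max_predict_ms": 1500,  # time limit (ms) for the generation phase once a newline has been seen
}
resp = requests.post("http://localhost:8080/completion", json=payload)
resp.raise_for_status()
print(resp.json()["content"])
```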
@@ -927,11 +899,6 @@ struct server_context {
             slot.sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
         }

-        if (slot.params.cache_prompt && slot.ga_n != 1) {
-            slot.params.cache_prompt = false;
-            SLT_WRN(slot, "%s", "group-attention is not supported with prompt caching. disabling cache\n");
-        }
-
         if (slot.n_predict > 0 && slot.params.n_predict > slot.n_predict) {
             // Might be better to reject the request with a 400 ?
             slot.params.n_predict = slot.n_predict;

@@ -999,7 +966,7 @@ struct server_context {
                         slot.sparams.logit_bias.push_back({tok, bias});
                     }
                 } else if (el[0].is_string()) {
-                    auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
+                    auto toks = common_tokenize(model, el[0].get<std::string>(), false);
                     for (auto tok : toks) {
                         slot.sparams.logit_bias.push_back({tok, bias});
                     }

@@ -1031,7 +998,7 @@ struct server_context {
                     sampler_names.emplace_back(name);
                 }
             }
-            slot.sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
+            slot.sparams.samplers = common_sampler_types_from_names(sampler_names, false);
         } else {
             slot.sparams.samplers = default_sparams.samplers;
         }

@@ -1039,10 +1006,10 @@ struct server_context {

         {
             if (slot.smpl != nullptr) {
-                gpt_sampler_free(slot.smpl);
+                common_sampler_free(slot.smpl);
             }

-            slot.smpl = gpt_sampler_init(model, slot.sparams);
+            slot.smpl = common_sampler_init(model, slot.sparams);
             if (slot.smpl == nullptr) {
                 // for now, the only error that may happen here is invalid grammar
                 send_error(task, "Failed to parse grammar", ERROR_TYPE_INVALID_REQUEST);

@@ -1066,59 +1033,9 @@ struct server_context {
         clean_kv_cache = false;
     }

-    void system_prompt_update() {
-        SRV_DBG("updating system prompt: '%s'\n", system_prompt.c_str());
-
-        kv_cache_clear();
-        system_tokens.clear();
-
-        if (!system_prompt.empty()) {
-            system_tokens = ::llama_tokenize(ctx, system_prompt, true);
-
-            const int32_t n_batch = llama_n_batch(ctx);
-            const int32_t n_tokens_prompt = system_tokens.size();
-
-            for (int32_t i = 0; i < n_tokens_prompt; i += n_batch) {
-                const int32_t n_tokens = std::min(n_batch, n_tokens_prompt - i);
-
-                llama_batch_clear(batch);
-
-                for (int32_t j = 0; j < n_tokens; ++j) {
-                    llama_batch_add(batch, system_tokens[i + j], i + j, { 0 }, false);
-                }
-
-                if (llama_decode(ctx, batch) != 0) {
-                    SRV_ERR("%s", "llama_decode() failed\n");
-                    return;
-                }
-            }
-
-            // assign the system KV cache to all parallel sequences
-            for (int32_t i = 1; i <= params.n_parallel; ++i) {
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
-            }
-        }
-
-        system_need_update = false;
-    }
-
-    bool system_prompt_set(const std::string & sys_prompt) {
-        SRV_DBG("system prompt set: '%s'\n", system_prompt.c_str());
-
-        system_prompt = sys_prompt;
-
-        // release all slots
-        for (server_slot & slot : slots) {
-            slot.release();
-        }
-
-        system_need_update = true;
-        return true;
-    }
-
     bool process_token(completion_token_output & result, server_slot & slot) {
         // remember which tokens were sampled - used for repetition penalties during sampling
-        const std::string token_str = llama_token_to_piece(ctx, result.tok, params.special);
+        const std::string token_str = common_token_to_piece(ctx, result.tok, params.special);
         slot.sampled = result.tok;

         // search stop word and delete it

@@ -1191,13 +1108,28 @@ struct server_context {
                 SLT_DBG(slot, "stopped by limit, n_decoded = %d, n_predict = %d\n", slot.n_decoded, slot.params.n_predict);
             }

+            // if we have already seen a new line, we stop after a certain time limit
+            if (slot.has_new_line && slot.params.t_max_predict_ms > 0 &&
+                (ggml_time_us() - slot.t_start_generation > 1000.0f*slot.params.t_max_predict_ms)) {
+                slot.stopped_limit = true;
+                slot.has_next_token = false;
+
+                SLT_DBG(slot, "stopped by time limit, n_decoded = %d, t_max_predict_ms = %d ms\n", slot.n_decoded, (int) slot.params.t_max_predict_ms);
+            }
+
+            // check if there is a new line in the generated text
+            if (result.text_to_send.find('\n') != std::string::npos) {
+                slot.has_new_line = true;
+            }
+
             // if context shift is disabled, we stop when it reaches the context limit
-            if (slot.n_decoded >= slot.n_ctx) {
+            if (slot.n_past >= slot.n_ctx) {
                 slot.truncated = true;
                 slot.stopped_limit = true;
                 slot.has_next_token = false;

-                SLT_DBG(slot, "stopped due to running out of context capacity, n_decoded = %d, n_ctx = %d\n", slot.n_decoded, slot.n_ctx);
+                SLT_DBG(slot, "stopped due to running out of context capacity, n_past = %d, n_prompt_tokens = %d, n_decoded = %d, n_ctx = %d\n",
+                        slot.n_decoded, slot.n_prompt_tokens, slot.n_past, slot.n_ctx);
             }

             if (llama_token_is_eog(model, result.tok)) {

@@ -1209,18 +1141,18 @@ struct server_context {

         const auto n_ctx_train = llama_n_ctx_train(model);

-        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.ga_n == 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
+        if (slot.params.n_predict < 1 && slot.n_predict < 1 && slot.n_prompt_tokens + slot.n_decoded >= n_ctx_train) {
             slot.truncated = true;
             slot.stopped_limit = true;
             slot.has_next_token = false; // stop prediction

             SLT_WRN(slot,
-                    "n_predict (%d) is not set and self-context extend is disabled. "
+                    "n_predict (%d) is set for infinite generation. "
                     "Limiting generated tokens to n_ctx_train (%d) to avoid EOS-less generation infinite loop\n",
                     slot.params.n_predict, n_ctx_train);
         }

-        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: '%s'\n", slot.n_decoded, slot.n_remaining, token_str.c_str());
+        SLT_DBG(slot, "n_decoded = %d, n_remaining = %d, next token: %5d '%s'\n", slot.n_decoded, slot.n_remaining, result.tok, token_str.c_str());

         return slot.has_next_token; // continue
     }

@@ -1229,7 +1161,7 @@ struct server_context {
         std::vector<std::string> samplers;
         samplers.reserve(slot.sparams.samplers.size());
         for (const auto & sampler : slot.sparams.samplers) {
-            samplers.emplace_back(gpt_sampler_type_to_str(sampler));
+            samplers.emplace_back(common_sampler_type_to_str(sampler));
         }

         return json {

@@ -1237,7 +1169,7 @@ struct server_context {
             {"n_predict", slot.n_predict}, // Server configured n_predict
             {"model", params.model_alias},
             {"seed", slot.sparams.seed},
-            {"seed_cur", slot.smpl ? gpt_sampler_get_seed(slot.smpl) : 0},
+            {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0},
             {"temperature", slot.sparams.temp},
             {"dynatemp_range", slot.sparams.dynatemp_range},
             {"dynatemp_exponent", slot.sparams.dynatemp_exponent},

@@ -1302,7 +1234,7 @@ struct server_context {
         };

         if (slot.sparams.n_probs > 0) {
-            const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
+            const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
             const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
             const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());

@@ -1339,6 +1271,7 @@ struct server_context {
             {"tokens_evaluated", slot.n_prompt_tokens},
             {"generation_settings", get_formated_generation(slot)},
             {"prompt", slot.prompt},
+            {"has_new_line", slot.has_new_line},
             {"truncated", slot.truncated},
             {"stopped_eos", slot.stopped_eos},
             {"stopped_word", slot.stopped_word},

@@ -1352,7 +1285,7 @@ struct server_context {
         if (slot.sparams.n_probs > 0) {
             std::vector<completion_token_output> probs;
             if (!slot.params.stream && slot.stopped_word) {
-                const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
+                const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);

                 size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
                 probs = std::vector<completion_token_output>(

@@ -1406,7 +1339,7 @@ struct server_context {
                 continue;
             }

-            llama_embd_normalize(embd, embd_res.data(), n_embd);
+            common_embd_normalize(embd, embd_res.data(), n_embd);

             res.data = json {
                 {"embedding", embd_res},

@@ -1488,9 +1421,8 @@ struct server_context {
         if (prompt.is_string() || json_is_array_of_numbers(prompt)) {
             data["index"] = 0;
             create_task(data, false, nullptr);
-        }
+        } else if (prompt.is_array()) {
             // otherwise, it's a multiple-prompt task, we break it into smaller tasks
-        else if (prompt.is_array()) {
             std::vector<json> prompts = prompt;
             if (cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
                 // prompts[0] is the question

@@ -1515,9 +1447,8 @@ struct server_context {
                 }
             }
         }
-        }
+        } else {
             // invalid case
-        else {
             throw std::runtime_error(error_msg);
         }

@@ -1627,16 +1558,6 @@ struct server_context {
                         break;
                     }

-                    if (task.data.contains("system_prompt")) {
-                        std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
-                        system_prompt_set(sys_prompt);
-
-                        for (server_slot & slot : slots) {
-                            slot.n_past = 0;
-                            slot.n_past_se = 0;
-                        }
-                    }
-
                     slot->reset();

                     slot->id_task = task.id;

@@ -1677,6 +1598,7 @@ struct server_context {
                     slot_data["prompt"] = slot.prompt;
                     slot_data["next_token"] = {
                         {"has_next_token", slot.has_next_token},
+                        {"has_new_line", slot.has_new_line},
                         {"n_remain", slot.n_remaining},
                         {"n_decoded", slot.n_decoded},
                         {"stopped_eos", slot.stopped_eos},

@@ -1800,6 +1722,9 @@ struct server_context {
                     }
                     slot->cache_tokens.resize(token_count);

+                    // TODO: maybe detokenize the slot->cache_tokens instead?
+                    slot->prompt = string_format("[restored %d tokens from file]", (int) token_count);
+
                     const int64_t t_end = ggml_time_us();
                     const double t_restore_ms = (t_end - t_start) / 1000.0;

@@ -1850,7 +1775,7 @@ struct server_context {
                 } break;
             case SERVER_TASK_TYPE_SET_LORA:
                 {
-                    llama_lora_adapters_apply(ctx, loras);
+                    common_lora_adapters_apply(ctx, loras);
                     server_task_result result;
                     result.id = task.id;
                     result.stop = true;

@@ -1862,10 +1787,6 @@ struct server_context {
     }

     void update_slots() {
-        if (system_need_update) {
-            system_prompt_update();
-        }
-
         // check if all slots are idle
         {
             bool all_idle = true;

@@ -1879,7 +1800,7 @@ struct server_context {

             if (all_idle) {
                 SRV_INF("%s", "all slots are idle\n");
-                if (system_prompt.empty() && clean_kv_cache) {
+                if (clean_kv_cache) {
                     kv_cache_clear();
                 }

@@ -1900,8 +1821,7 @@ struct server_context {
         // apply context-shift if needed
         // TODO: simplify and improve
         for (server_slot & slot : slots) {
-            if (slot.ga_n == 1) {
-                if (slot.is_processing() && (int) system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+            if (slot.is_processing() && slot.n_past + 1 >= slot.n_ctx) {
                 if (!params.ctx_shift) {
                     // this check is redundant (for good)
                     // we should never get here, because generation should already stopped in process_token()

@@ -1912,13 +1832,13 @@ struct server_context {

                 // Shift context
                 const int n_keep = slot.params.n_keep + add_bos_token;
-                const int n_left = (int) system_tokens.size() + slot.n_past - n_keep;
+                const int n_left = slot.n_past - n_keep;
                 const int n_discard = slot.params.n_discard ? slot.params.n_discard : (n_left / 2);

                 SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);

                 llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard);
+                llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, slot.n_past, -n_discard);

                 if (slot.params.cache_prompt) {
                     for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {

@@ -1933,10 +1853,9 @@ struct server_context {
                     slot.truncated = true;
                 }
             }
-        }

         // start populating the batch for this iteration
-        llama_batch_clear(batch);
+        common_batch_clear(batch);

         // frist, add sampled tokens from any ongoing sequences
         for (auto & slot : slots) {

@@ -1946,11 +1865,7 @@ struct server_context {

             slot.i_batch = batch.n_tokens;

-            const int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-            // TODO: we always have to take into account the "system_tokens"
-            // this is not great and needs to be improved somehow
-            llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true);
+            common_batch_add(batch, slot.sampled, slot.n_past, { slot.id + 1 }, true);

             slot.n_past += 1;

@@ -1958,8 +1873,8 @@ struct server_context {
                 slot.cache_tokens.push_back(slot.sampled);
             }

-            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_system_tokens = %d, n_cache_tokens = %d, truncated = %d\n",
-                    slot.n_ctx, slot.n_past, (int) system_tokens.size(), (int) slot.cache_tokens.size(), slot.truncated);
+            SLT_DBG(slot, "slot decode token, n_ctx = %d, n_past = %d, n_cache_tokens = %d, truncated = %d\n",
+                    slot.n_ctx, slot.n_past, (int) slot.cache_tokens.size(), slot.truncated);
         }

         // process in chunks of params.n_batch

@@ -1986,39 +1901,14 @@ struct server_context {
                 slot.t_start_process_prompt = ggml_time_us();
                 slot.t_start_generation = 0;

-                if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_INFILL) {
-                    const bool add_bos = llama_add_bos_token(model);
-                    bool suff_rm_leading_spc = true;
-                    if (params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
-                        params.input_suffix.erase(0, 1);
-                        suff_rm_leading_spc = false;
-                    }
-
-                    auto prefix_tokens = tokenize(slot.params.input_prefix, false);
-                    auto suffix_tokens = tokenize(slot.params.input_suffix, false);
-
-                    const int space_token = 29871; // TODO: this should not be hardcoded
-                    if (suff_rm_leading_spc && !suffix_tokens.empty() && suffix_tokens[0] == space_token) {
-                        suffix_tokens.erase(suffix_tokens.begin());
-                    }
-
-                    prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model));
-                    suffix_tokens.insert(suffix_tokens.begin(), llama_token_suffix(model));
-
-                    auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
-                    auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
-                    if (add_bos) {
-                        embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
-                    }
-                    embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
-
-                    const llama_token middle_token = llama_token_middle(model);
-                    if (middle_token >= 0) {
-                        embd_inp.push_back(middle_token);
-                    }
-
-                    prompt_tokens = embd_inp;
-                } else if (slot.cmpl_type == SERVER_TASK_CMPL_TYPE_RERANK) {
+                switch (slot.cmpl_type) {
+                    case SERVER_TASK_CMPL_TYPE_NORMAL:
+                    case SERVER_TASK_CMPL_TYPE_EMBEDDING:
+                        {
+                            prompt_tokens = tokenize(slot.prompt, llama_add_bos_token(model), true);
+                        } break;
+                    case SERVER_TASK_CMPL_TYPE_RERANK:
+                        {
                     // require slot.prompt to be array of 2 strings
                     if (!slot.prompt.is_array() || slot.prompt.size() != 2) {
                         SLT_ERR(slot, "%s", "invalid prompt for rerank task\n");

@@ -2027,22 +1917,48 @@ struct server_context {
                         continue;
                     }

-                    // prompt: <s>query</s><s>doc</s>
+                    // prompt: [BOS]query[EOS][SEP]doc[EOS]
                     prompt_tokens.clear();
                     prompt_tokens.push_back(llama_token_bos(model));
                     {
-                        const auto part = tokenize(slot.prompt[0], false);
+                        const auto part = tokenize(slot.prompt[0], false, false);
                         prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                     }
                     prompt_tokens.push_back(llama_token_eos(model));
-                    prompt_tokens.push_back(llama_token_bos(model));
+                    prompt_tokens.push_back(llama_token_sep(model));
                     {
-                        const auto part = tokenize(slot.prompt[1], false);
+                        const auto part = tokenize(slot.prompt[1], false, false);
                         prompt_tokens.insert(prompt_tokens.end(), part.begin(), part.end());
                     }
                     prompt_tokens.push_back(llama_token_eos(model));
-                } else {
-                    prompt_tokens = tokenize(slot.prompt, system_prompt.empty()); // add BOS if there isn't system prompt
+                        } break;
+                    case SERVER_TASK_CMPL_TYPE_INFILL:
+                        {
+                            auto prefix_tokens = tokenize(slot.params.input_prefix, false, false);
+                            auto suffix_tokens = tokenize(slot.params.input_suffix, false, false);
+
+                            // for now pick context to fit in a single batch (ratio prefix:suffix = 3:1, TODO: configurable?)
+                            const int n_suffix_take = std::min<int>(suffix_tokens.size(), n_batch/4);
+                            const int n_prefix_take = std::min<int>(prefix_tokens.size(), (n_batch - 3) - n_suffix_take);
+
+                            prefix_tokens.erase(prefix_tokens.begin(), prefix_tokens.begin() + prefix_tokens.size() - n_prefix_take);
+                            suffix_tokens.resize(n_suffix_take);
+
+                            prefix_tokens.insert(prefix_tokens.begin(), llama_token_fim_pre(model));
+                            suffix_tokens.insert(suffix_tokens.begin(), llama_token_fim_suf(model));
+
+                            auto embd_inp = params.spm_infill ? suffix_tokens : prefix_tokens;
+                            auto embd_end = params.spm_infill ? prefix_tokens : suffix_tokens;
+
+                            if (llama_add_bos_token(model)) {
+                                embd_inp.insert(embd_inp.begin(), llama_token_bos(model));
+                            }
+
+                            embd_inp.insert(embd_inp.end(), embd_end.begin(), embd_end.end());
+                            embd_inp.push_back(llama_token_fim_mid(model));
+
+                            prompt_tokens = std::move(embd_inp);
+                        } break;
                 }

                 slot.n_past = 0;

@@ -2050,6 +1966,19 @@ struct server_context {

                 SLT_INF(slot, "prompt tokenized, n_ctx_slot = %d, n_keep = %d, n_prompt_tokens = %d\n", slot.n_ctx, slot.params.n_keep, slot.n_prompt_tokens);

+                // print prompt tokens (for debugging)
+                if (1) {
+                    // first 16 tokens (avoid flooding logs)
+                    for (int i = 0; i < std::min<int>(16, prompt_tokens.size()); i++) {
+                        SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                    }
+                } else {
+                    // all
+                    for (int i = 0; i < (int) prompt_tokens.size(); i++) {
+                        SLT_DBG(slot, "prompt token %3d: %6d '%s'\n", i, prompt_tokens[i], common_token_to_piece(ctx, prompt_tokens[i]).c_str());
+                    }
+                }
+
                 // empty prompt passed -> release the slot and send empty response
                 if (prompt_tokens.empty()) {
                     SLT_WRN(slot, "%s", "empty prompt - releasing slot\n");

@@ -2070,7 +1999,9 @@ struct server_context {
                 } else {
                     if (!params.ctx_shift) {
                         // if context shift is disabled, we make sure prompt size is smaller than KV size
-                        if ((int) system_tokens.size() + slot.n_prompt_tokens >= slot.n_ctx) {
+                        // TODO: there should be a separate parameter that control prompt truncation
+                        // context shift should be applied only during the generation phase
+                        if (slot.n_prompt_tokens >= slot.n_ctx) {
                             slot.release();
                             send_error(slot, "the request exceeds the available context size. try increasing the context size or enable context shift", ERROR_TYPE_INVALID_REQUEST);
                             continue;

@@ -2082,7 +2013,7 @@ struct server_context {
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);

                     // if input prompt is too big, truncate it (if group attention self-extend is disabled)
-                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) {
+                    if (slot.n_prompt_tokens >= slot.n_ctx) {
                         const int n_left = slot.n_ctx - slot.params.n_keep;

                         const int n_block_size = n_left / 2;

@@ -2107,20 +2038,15 @@ struct server_context {
                         GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }

-                    gpt_sampler_reset(slot.smpl);
+                    common_sampler_reset(slot.smpl);

-                    if (!slot.params.cache_prompt) {
-                        slot.n_past_se = 0;
-                        slot.ga_i = 0;
-                    } else {
-                        GGML_ASSERT(slot.ga_n == 1);
-
+                    if (slot.params.cache_prompt) {
                         // reuse any previously computed tokens that are common with the new prompt
                         slot.n_past = common_part(slot.cache_tokens, prompt_tokens);

                         // push the prompt into the sampling context (do not apply grammar)
                         for (int i = 0; i < slot.n_past; ++i) {
-                            gpt_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
+                            common_sampler_accept(slot.smpl, slot.cache_tokens[i], false);
                         }
                     }
                 }

@@ -2130,9 +2056,6 @@ struct server_context {
                     SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);

                     slot.n_past--;
-                    if (slot.ga_i > 0) {
-                        slot.n_past_se--;
-                    }
                 }

                 slot.n_prompt_tokens_processed = 0;

@@ -2158,55 +2081,31 @@ struct server_context {
                 }

                 // keep only the common part
-                int p0 = (int) system_tokens.size() + slot.n_past;
-
-                if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) {
+                if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, slot.n_past, -1)) {
                     // could not partially delete (likely using a non-Transformer model)
                     llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1);

-                    p0 = (int) system_tokens.size();
-                    if (p0 != 0) {
-                        // copy over the system prompt when there is one
-                        llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1);
-                    }
-
-                    // there is no common part left (except for the system prompt)
+                    // there is no common part left
                     slot.n_past = 0;
-                    slot.n_past_se = 0;
-                    slot.ga_i = 0;
-                    // TODO: is the system prompt ever in the sampling context?
-                    gpt_sampler_reset(slot.smpl);
+
+                    common_sampler_reset(slot.smpl);
                 }

+                SLT_INF(slot, "kv cache rm [%d, end)\n", slot.n_past);
+
                 // remove the non-common part from the cache
                 slot.cache_tokens.resize(slot.n_past);

-                SLT_INF(slot, "kv cache rm [%d, end)\n", p0);
-
-                int32_t slot_npast = slot.n_past_se > 0 ? slot.n_past_se : slot.n_past;
-
-                int32_t ga_i = slot.ga_i;
-                int32_t ga_n = slot.ga_n;
-                int32_t ga_w = slot.ga_w;
-
                 // add prompt tokens for processing in the current batch
-                // TODO: the self-extend stuff here is a mess - simplify and/or abstract it somehow
-                for (; slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch; ++slot.n_past) {
-                    if (slot.ga_n != 1) {
-                        while (slot_npast >= ga_i + ga_w) {
-                            const int bd = (ga_w/ga_n)*(ga_n - 1);
-                            slot_npast -= bd;
-                            ga_i += ga_w/ga_n;
-                        }
-                    }
-
-                    llama_batch_add(batch, prompt_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false);
+                while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) {
+                    common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id + 1 }, false);

                     if (slot.params.cache_prompt) {
                         slot.cache_tokens.push_back(prompt_tokens[slot.n_past]);
                     }

                     slot.n_prompt_tokens_processed++;
-                    slot_npast++;
+                    slot.n_past++;
                 }

                 SLT_INF(slot, "prompt processing progress, n_past = %d, n_tokens = %d, progress = %f\n", slot.n_past, batch.n_tokens, (float) slot.n_prompt_tokens_processed / slot.n_prompt_tokens);

@@ -2247,34 +2146,6 @@ struct server_context {
         for (int32_t i = 0; i < batch.n_tokens; i += n_batch) {
             const int32_t n_tokens = std::min(n_batch, batch.n_tokens - i);

-            for (auto & slot : slots) {
-                if (slot.ga_n != 1) {
-                    // context extension via Self-Extend
-                    // TODO: simplify and/or abstract this
-                    while (slot.n_past_se >= slot.ga_i + slot.ga_w) {
-                        const int ib = (slot.ga_n * slot.ga_i) / slot.ga_w;
-                        const int bd = (slot.ga_w / slot.ga_n) * (slot.ga_n - 1);
-                        const int dd = (slot.ga_w / slot.ga_n) - ib * bd - slot.ga_w;
-
-                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i, slot.n_past_se, ib * bd, slot.ga_i + ib * bd, slot.n_past_se + ib * bd);
-                        SLT_DBG(slot, "div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n);
-                        SLT_DBG(slot, "shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd);
-
-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd);
-                        llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n);
-                        llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd);
-
-                        slot.n_past_se -= bd;
-
-                        slot.ga_i += slot.ga_w / slot.ga_n;
-
-                        SLT_DBG(slot, "\nn_past_old = %d, n_past = %d, ga_i = %d\n\n", slot.n_past_se + bd, slot.n_past_se, slot.ga_i);
-                    }
-
-                    slot.n_past_se += n_tokens;
-                }
-            }
-
             llama_batch batch_view = {
                 n_tokens,
                 batch.token + i,

@@ -2337,9 +2208,9 @@ struct server_context {
                 }

                 completion_token_output result;
-                const llama_token id = gpt_sampler_sample(slot.smpl, ctx, slot.i_batch - i);
+                const llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i);

-                gpt_sampler_accept(slot.smpl, id, true);
+                common_sampler_accept(slot.smpl, id, true);

                 slot.n_decoded += 1;
                 if (slot.n_decoded == 1) {

@@ -2350,7 +2221,7 @@ struct server_context {

                 result.tok = id;

-                const auto * cur_p = gpt_sampler_get_candidates(slot.smpl);
+                const auto * cur_p = common_sampler_get_candidates(slot.smpl);

                 for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
                     result.probs.push_back({

@@ -2414,13 +2285,13 @@ inline void signal_handler(int signal) {

 int main(int argc, char ** argv) {
     // own arguments required by this example
-    gpt_params params;
+    common_params params;

-    if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
+    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SERVER)) {
         return 1;
     }

-    gpt_init();
+    common_init();

     // enabling this will output extra debug information in the HTTP responses from the server
     // see format_final_response_oaicompat()

@@ -2429,10 +2300,6 @@ int main(int argc, char ** argv) {
     // struct that contains llama context and inference
     server_context ctx_server;

-    if (!params.system_prompt.empty()) {
-        ctx_server.system_prompt_set(params.system_prompt);
-    }
-
     if (params.model_alias == "unknown") {
         params.model_alias = params.model;
     }

@@ -2442,7 +2309,7 @@ int main(int argc, char ** argv) {

     LOG_INF("system info: n_threads = %d, n_threads_batch = %d, total_threads = %d\n", params.cpuparams.n_threads, params.cpuparams_batch.n_threads, std::thread::hardware_concurrency());
     LOG_INF("\n");
-    LOG_INF("%s\n", gpt_params_get_system_info(params).c_str());
+    LOG_INF("%s\n", common_params_get_system_info(params).c_str());
     LOG_INF("\n");

     std::unique_ptr<httplib::Server> svr;

@@ -2536,20 +2403,10 @@ int main(int argc, char ** argv) {
     //

     auto middleware_validate_api_key = [&params, &res_error](const httplib::Request & req, httplib::Response & res) {
-        // TODO: should we apply API key to all endpoints, including "/health" and "/models"?
-        static const std::unordered_set<std::string> protected_endpoints = {
-            "/props",
-            "/completion",
-            "/completions",
-            "/v1/completions",
-            "/chat/completions",
-            "/v1/chat/completions",
-            "/infill",
-            "/tokenize",
-            "/detokenize",
-            "/embedding",
-            "/embeddings",
-            "/v1/embeddings",
+        static const std::unordered_set<std::string> public_endpoints = {
+            "/health",
+            "/models",
+            "/v1/models",
         };

         // If API key is not set, skip validation

@@ -2557,8 +2414,8 @@ int main(int argc, char ** argv) {
             return true;
         }

-        // If path is not in protected_endpoints list, skip validation
-        if (protected_endpoints.find(req.path) == protected_endpoints.end()) {
+        // If path is public, skip validation
+        if (public_endpoints.find(req.path) != public_endpoints.end()) {
             return true;
         }

@@ -2620,7 +2477,7 @@ int main(int argc, char ** argv) {

     const auto handle_slots = [&](const httplib::Request & req, httplib::Response & res) {
         if (!params.endpoint_slots) {
-            res_error(res, format_error_response("This server does not support slots endpoint. Start it without `--no-slots`", ERROR_TYPE_NOT_SUPPORTED));
+            res_error(res, format_error_response("This server does not support slots endpoint. Start it with `--slots`", ERROR_TYPE_NOT_SUPPORTED));
             return;
         }

@@ -2869,24 +2726,28 @@ int main(int argc, char ** argv) {
     };

     const auto handle_props = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) {
-        std::string template_key = "tokenizer.chat_template", curr_tmpl;
-        int32_t tlen = llama_model_meta_val_str(ctx_server.model, template_key.c_str(), nullptr, 0);
-        if (tlen > 0) {
-            std::vector<char> curr_tmpl_buf(tlen + 1, 0);
-            if (llama_model_meta_val_str(ctx_server.model, template_key.c_str(), curr_tmpl_buf.data(), curr_tmpl_buf.size()) == tlen) {
-                curr_tmpl = std::string(curr_tmpl_buf.data(), tlen);
-            }
-        }
         json data = {
-            { "system_prompt", ctx_server.system_prompt.c_str() },
             { "default_generation_settings", ctx_server.default_generation_settings_for_props },
             { "total_slots", ctx_server.params.n_parallel },
-            { "chat_template", curr_tmpl.c_str() },
+            { "chat_template", llama_get_chat_template(ctx_server.model) },
         };

         res_ok(res, data);
     };

+    const auto handle_props_change = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
+        if (!ctx_server.params.endpoint_props) {
+            res_error(res, format_error_response("This server does not support changing global properties. Start it with `--props`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
+        json data = json::parse(req.body);
+
+        // update any props here
+
+        res_ok(res, {{ "success", true }});
+    };
+
     const auto handle_completions_generic = [&ctx_server, &res_error, &res_ok](server_task_cmpl_type cmpl_type, json & data, httplib::Response & res) {
         if (ctx_server.params.embedding || ctx_server.params.reranking) {
             res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings` or `--reranking`", ERROR_TYPE_NOT_SUPPORTED));

@@ -2942,7 +2803,23 @@ int main(int argc, char ** argv) {
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_NORMAL, data, res);
     };

-    const auto handle_infill = [&handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_infill = [&ctx_server, &res_error, &handle_completions_generic](const httplib::Request & req, httplib::Response & res) {
+        std::string err;
+        if (llama_token_fim_pre(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "prefix token is missing. ";
+        }
+        if (llama_token_fim_suf(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "suffix token is missing. ";
+        }
+        if (llama_token_fim_mid(ctx_server.model) == LLAMA_TOKEN_NULL) {
+            err += "middle token is missing. ";
+        }
+
+        if (!err.empty()) {
+            res_error(res, format_error_response(string_format("Infill is not supported by this model: %s", err.c_str()), ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }
+
         json data = json::parse(req.body);
         return handle_completions_generic(SERVER_TASK_CMPL_TYPE_INFILL, data, res);
     };
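The reworked `handle_infill` above rejects requests unless the loaded model defines FIM prefix/suffix/middle tokens. A hedged sketch of a client-side `/infill` call that would pass that check with a FIM-capable code model; the host, port, prompt text, and `requests` usage are illustrative assumptions only.

```python
# Hypothetical /infill request; rejected as NOT_SUPPORTED if the model lacks FIM tokens.
import requests

payload = {
    "input_prefix": "def remove_non_ascii(s: str) -> str:\n    result = ",
    "input_suffix": "\n    return result\n",
    "n_predict": 64,
}
resp = requests.post("http://localhost:8080/infill", json=payload)
resp.raise_for_status()
print(resp.json()["content"])
```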
@@ -3028,11 +2905,12 @@ int main(int argc, char ** argv) {
         if (body.count("content") != 0) {
             const bool add_special = json_value(body, "add_special", false);
             const bool with_pieces = json_value(body, "with_pieces", false);
-            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special, true);

             if (with_pieces) {
                 for (const auto& token : tokens) {
-                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    std::string piece = common_token_to_piece(ctx_server.ctx, token);
                     json piece_json;

                     // Check if the piece is valid UTF-8

@@ -3265,6 +3143,12 @@ int main(int argc, char ** argv) {
         svr->set_base_dir(params.public_path);
     }

+    if (!params.api_keys.empty()) {
+        // for now, if API key is set, web UI is unusable
+        svr->Get("/", [&](const httplib::Request &, httplib::Response & res) {
+            return res.set_content("Web UI is disabled because API key is set.", "text/html; charset=utf-8");
+        });
+    } else {
         // using embedded static files
         svr->Get("/", handle_static_file(index_html, index_html_len, "text/html; charset=utf-8"));
         svr->Get("/index.js", handle_static_file(index_js, index_js_len, "text/javascript; charset=utf-8"));

@@ -3283,12 +3167,15 @@ int main(int argc, char ** argv) {
         svr->Get("/index-new.html", handle_static_file(index_new_html, index_new_html_len, "text/html; charset=utf-8"));
         svr->Get("/system-prompts.js", handle_static_file(system_prompts_js, system_prompts_js_len, "text/javascript; charset=utf-8"));
         svr->Get("/prompt-formats.js", handle_static_file(prompt_formats_js, prompt_formats_js_len, "text/javascript; charset=utf-8"));
+    }

     // register API routes
-    svr->Get ("/health", handle_health);
+    svr->Get ("/health", handle_health); // public endpoint (no API key check)
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
-    svr->Get ("/v1/models", handle_models);
+    svr->Post("/props", handle_props_change);
+    svr->Get ("/models", handle_models); // public endpoint (no API key check)
+    svr->Get ("/v1/models", handle_models); // public endpoint (no API key check)
     svr->Post("/completion", handle_completions); // legacy
     svr->Post("/completions", handle_completions);
     svr->Post("/v1/completions", handle_completions);

@@ -3366,7 +3253,7 @@ int main(int argc, char ** argv) {
     }

     // print sample chat example to make it clear which template is used
     LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), llama_chat_format_example(ctx_server.model, params.chat_template).c_str());
|
LOG_INF("%s: chat template, built_in: %d, chat_example: '%s'\n", __func__, params.chat_template.empty(), common_chat_format_example(ctx_server.model, params.chat_template).c_str());
|
||||||
|
|
||||||
ctx_server.queue_tasks.on_new_task(std::bind(
|
ctx_server.queue_tasks.on_new_task(std::bind(
|
||||||
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
&server_context::process_single_task, &ctx_server, std::placeholders::_1));
|
||||||
|
|
|
@@ -13,6 +13,10 @@ Feature: llama.cpp server
And 32 as batch size
And 2 slots

+ # the prompt is 301 tokens
+ # the slot context is 256/2 = 128 tokens
+ # the prompt is truncated to keep the last 109 tokens
+ # 64 tokens are generated thanks to shifting the context when it gets full
Scenario: Inference with context shift
And 64 server max tokens to predict
Then the server is starting
@@ -5,7 +5,7 @@ Feature: Security
Background: Server startup with an api key defined
Given a server listening on localhost:8080
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
- And a server api key llama.cpp
+ And a server api key THIS_IS_THE_KEY
Then the server is starting
Then the server is healthy

@@ -17,8 +17,8 @@ Feature: Security

Examples: Prompts
| api_key | api_error |
- | llama.cpp | no |
+ | THIS_IS_THE_KEY | no |
- | llama.cpp | no |
+ | THIS_IS_THE_KEY | no |
| hackeme | raised |
| | raised |

@@ -33,8 +33,8 @@ Feature: Security

Examples: Prompts
| api_key | api_error |
- | llama.cpp | no |
+ | THIS_IS_THE_KEY | no |
- | llama.cpp | no |
+ | THIS_IS_THE_KEY | no |
| hackme | raised |

Scenario Outline: OAI Compatibility (invalid response formats)

@@ -55,7 +55,7 @@ Feature: Security

Scenario Outline: CORS Options
- Given a user api key llama.cpp
+ Given a user api key THIS_IS_THE_KEY
When an OPTIONS request is sent from <origin>
Then CORS header <cors_header> is set to <cors_header_value>
@@ -1299,7 +1299,8 @@ async def wait_for_slots_status(context,

async with aiohttp.ClientSession(timeout=DEFAULT_TIMEOUT_SECONDS) as session:
while True:
- async with await session.get(f'{base_url}/slots', params=params) as slots_response:
+ headers = {'Authorization': f'Bearer {context.server_api_key}'}
+ async with await session.get(f'{base_url}/slots', params=params, headers=headers) as slots_response:
status_code = slots_response.status
slots = await slots_response.json()
if context.debug:

@@ -1387,6 +1388,7 @@ def start_server_background(context):
context.server_path = os.environ['LLAMA_SERVER_BIN_PATH']
server_listen_addr = context.server_fqdn
server_args = [
+ '--slots', # requires to get slot status via /slots endpoint
'--host', server_listen_addr,
'--port', context.server_port,
]
@@ -57,7 +57,7 @@ static T json_value(const json & body, const std::string & key, const T & defaul

// Format given chat. If tmpl is empty, we take the template from model metadata
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
- std::vector<llama_chat_msg> chat;
+ std::vector<common_chat_msg> chat;

for (size_t i = 0; i < messages.size(); ++i) {
const auto & curr_msg = messages[i];

@@ -84,12 +84,25 @@ inline std::string format_chat(const struct llama_model * model, const std::stri
chat.push_back({role, content});
}

- const auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
+ const auto formatted_chat = common_chat_apply_template(model, tmpl, chat, true);
LOG_DBG("formatted_chat: '%s'\n", formatted_chat.c_str());

return formatted_chat;
}

+ static std::string llama_get_chat_template(const struct llama_model * model) {
+ std::string template_key = "tokenizer.chat_template";
+ // call with NULL buffer to get the total size of the string
+ int32_t res = llama_model_meta_val_str(model, template_key.c_str(), NULL, 0);
+ if (res < 0) {
+ return "";
+ } else {
+ std::vector<char> model_template(res, 0);
+ llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size());
+ return std::string(model_template.data(), model_template.size());
+ }
+ }

//
// base64 utils (TODO: move to common in the future)
//

@@ -233,7 +246,7 @@ template <class Iter>
static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {
std::string ret;
for (; begin != end; ++begin) {
- ret += llama_token_to_piece(ctx, *begin);
+ ret += common_token_to_piece(ctx, *begin);
}

return ret;

@@ -241,7 +254,7 @@ static std::string tokens_to_str(llama_context * ctx, Iter begin, Iter end) {

// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context * ctx, const llama_token token) {
- std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
+ std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);

// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
@@ -1,5 +1,5 @@
set(TARGET llama-simple)
add_executable(${TARGET} simple.cpp)
install(TARGETS ${TARGET} RUNTIME)
- target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+ target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
@@ -1,50 +1,112 @@
- #include "arg.h"
- #include "common.h"
- #include "log.h"
#include "llama.h"
+ #include <cstdio>
+ #include <cstring>
+ #include <string>
#include <vector>

static void print_usage(int, char ** argv) {
- LOG("\nexample usage:\n");
+ printf("\nexample usage:\n");
- LOG("\n %s -m model.gguf -p \"Hello my name is\" -n 32\n", argv[0]);
+ printf("\n %s -m model.gguf [-n n_predict] [-ngl n_gpu_layers] [prompt]\n", argv[0]);
- LOG("\n");
+ printf("\n");
}

int main(int argc, char ** argv) {
- gpt_params params;
+ // path to the model gguf file
+ std::string model_path;
+ // prompt to generate text from
+ std::string prompt = "Hello my name is";
+ // number of layers to offload to the GPU
+ int ngl = 99;
+ // number of tokens to predict
+ int n_predict = 32;

- params.prompt = "Hello my name is";
- params.n_predict = 32;
+ // parse command line arguments

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON, print_usage)) {
+ {
+ int i = 1;
+ for (; i < argc; i++) {
+ if (strcmp(argv[i], "-m") == 0) {
+ if (i + 1 < argc) {
+ model_path = argv[++i];
+ } else {
+ print_usage(argc, argv);
return 1;
}
+ } else if (strcmp(argv[i], "-n") == 0) {
- gpt_init();
+ if (i + 1 < argc) {
+ try {
- // total length of the sequence including the prompt
+ n_predict = std::stoi(argv[++i]);
- const int n_predict = params.n_predict;
+ } catch (...) {
+ print_usage(argc, argv);
- // init LLM
+ return 1;
+ }
- llama_backend_init();
+ } else {
- llama_numa_init(params.numa);
+ print_usage(argc, argv);
+ return 1;
+ }
+ } else if (strcmp(argv[i], "-ngl") == 0) {
+ if (i + 1 < argc) {
+ try {
+ ngl = std::stoi(argv[++i]);
+ } catch (...) {
+ print_usage(argc, argv);
+ return 1;
+ }
+ } else {
+ print_usage(argc, argv);
+ return 1;
+ }
+ } else {
+ // prompt starts here
+ break;
+ }
+ }
+ if (model_path.empty()) {
+ print_usage(argc, argv);
+ return 1;
+ }
+ if (i < argc) {
+ prompt = argv[i++];
+ for (; i < argc; i++) {
+ prompt += " ";
+ prompt += argv[i];
+ }
+ }
+ }

// initialize the model

- llama_model_params model_params = llama_model_params_from_gpt_params(params);
+ llama_model_params model_params = llama_model_default_params();
+ model_params.n_gpu_layers = ngl;

- llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params);
+ llama_model * model = llama_load_model_from_file(model_path.c_str(), model_params);

if (model == NULL) {
fprintf(stderr , "%s: error: unable to load model\n" , __func__);
return 1;
}

+ // tokenize the prompt
+
+ // find the number of tokens in the prompt
+ const int n_prompt = -llama_tokenize(model, prompt.c_str(), prompt.size(), NULL, 0, true, true);
+
+ // allocate space for the tokens and tokenize the prompt
+ std::vector<llama_token> prompt_tokens(n_prompt);
+ if (llama_tokenize(model, prompt.c_str(), prompt.size(), prompt_tokens.data(), prompt_tokens.size(), true, true) < 0) {
+ fprintf(stderr, "%s: error: failed to tokenize the prompt\n", __func__);
+ return 1;
+ }

// initialize the context

- llama_context_params ctx_params = llama_context_params_from_gpt_params(params);
+ llama_context_params ctx_params = llama_context_default_params();
+ // n_ctx is the context size
+ ctx_params.n_ctx = n_prompt + n_predict - 1;
+ // n_batch is the maximum number of tokens that can be processed in a single call to llama_decode
+ ctx_params.n_batch = n_prompt;
+ // enable performance counters
+ ctx_params.no_perf = false;

llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -53,117 +115,87 @@ int main(int argc, char ** argv) {
return 1;
}

+ // initialize the sampler
+
auto sparams = llama_sampler_chain_default_params();

sparams.no_perf = false;

llama_sampler * smpl = llama_sampler_chain_init(sparams);

llama_sampler_chain_add(smpl, llama_sampler_init_greedy());

- // tokenize the prompt
-
- std::vector<llama_token> tokens_list;
- tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
- const int n_ctx = llama_n_ctx(ctx);
- const int n_kv_req = tokens_list.size() + (n_predict - tokens_list.size());
-
- LOG("\n");
- LOG_INF("%s: n_predict = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_predict, n_ctx, n_kv_req);
-
- // make sure the KV cache is big enough to hold all the prompt and generated tokens
- if (n_kv_req > n_ctx) {
- LOG_ERR("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
- LOG_ERR("%s: either reduce n_predict or increase n_ctx\n", __func__);
- return 1;
- }

// print the prompt token-by-token

- LOG("\n");
+ for (auto id : prompt_tokens) {
+ char buf[128];
- for (auto id : tokens_list) {
+ int n = llama_token_to_piece(model, id, buf, sizeof(buf), 0, true);
- LOG("%s", llama_token_to_piece(ctx, id).c_str());
+ if (n < 0) {
- }
+ fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);

- // create a llama_batch with size 512
- // we use this object to submit token data for decoding
-
- llama_batch batch = llama_batch_init(512, 0, 1);
-
- // evaluate the initial prompt
- for (size_t i = 0; i < tokens_list.size(); i++) {
- llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
- }
-
- // llama_decode will output logits only for the last token of the prompt
- batch.logits[batch.n_tokens - 1] = true;
-
- if (llama_decode(ctx, batch) != 0) {
- LOG("%s: llama_decode() failed\n", __func__);
return 1;
}
+ std::string s(buf, n);
+ printf("%s", s.c_str());
+ }

+ // prepare a batch for the prompt
+
+ llama_batch batch = llama_batch_get_one(prompt_tokens.data(), prompt_tokens.size(), 0, 0);

// main loop

- int n_cur = batch.n_tokens;
- int n_decode = 0;

const auto t_main_start = ggml_time_us();
+ int n_decode = 0;
+ llama_token new_token_id;

+ for (int n_pos = 0; n_pos + batch.n_tokens < n_prompt + n_predict; ) {
+ // evaluate the current batch with the transformer model
+ if (llama_decode(ctx, batch)) {
+ fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+ return 1;
+ }
+
+ n_pos += batch.n_tokens;

- while (n_cur <= n_predict) {
// sample the next token
{
- const llama_token new_token_id = llama_sampler_sample(smpl, ctx, -1);
+ new_token_id = llama_sampler_sample(smpl, ctx, -1);

// is it an end of generation?
- if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
+ if (llama_token_is_eog(model, new_token_id)) {
- LOG("\n");

break;
}

- LOG("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+ char buf[128];
+ int n = llama_token_to_piece(model, new_token_id, buf, sizeof(buf), 0, true);
+ if (n < 0) {
+ fprintf(stderr, "%s: error: failed to convert token to piece\n", __func__);
+ return 1;
+ }
+ std::string s(buf, n);
+ printf("%s", s.c_str());
fflush(stdout);

- // prepare the next batch
+ // prepare the next batch with the sampled token
- llama_batch_clear(batch);
+ batch = llama_batch_get_one(&new_token_id, 1, n_pos, 0);

- // push this new token for next evaluation
- llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);

n_decode += 1;
}

- n_cur += 1;

- // evaluate the current batch with the transformer model
- if (llama_decode(ctx, batch)) {
- LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1);
- return 1;
- }
}

- LOG("\n");
+ printf("\n");

const auto t_main_end = ggml_time_us();

- LOG_INF("%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
+ fprintf(stderr, "%s: decoded %d tokens in %.2f s, speed: %.2f t/s\n",
__func__, n_decode, (t_main_end - t_main_start) / 1000000.0f, n_decode / ((t_main_end - t_main_start) / 1000000.0f));

- LOG("\n");
+ fprintf(stderr, "\n");
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
+ fprintf(stderr, "\n");

- LOG("\n");

- llama_batch_free(batch);
llama_sampler_free(smpl);
llama_free(ctx);
llama_free_model(model);

- llama_backend_free();

return 0;
}
@@ -26,20 +26,20 @@ struct seq_draft {
std::vector<llama_token> tokens;
std::vector<std::vector<llama_token_data>> dists;

- struct gpt_sampler * smpl = nullptr;
+ struct common_sampler * smpl = nullptr;
};

int main(int argc, char ** argv) {
- gpt_params params;
+ common_params params;

// needed to get candidate probs even for temp <= 0.0
params.sparams.n_probs = 128;

- if (!gpt_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
+ if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_SPECULATIVE)) {
return 1;
}

- gpt_init();
+ common_init();

if (params.model_draft.empty()) {
LOG_ERR("%s: --model-draft is required\n", __func__);

@@ -66,7 +66,7 @@ int main(int argc, char ** argv) {
llama_context * ctx_dft = NULL;

// load the target model
- llama_init_result llama_init_tgt = llama_init_from_gpt_params(params);
+ common_init_result llama_init_tgt = common_init_from_params(params);
model_tgt = llama_init_tgt.model;
ctx_tgt = llama_init_tgt.context;

@@ -78,7 +78,7 @@ int main(int argc, char ** argv) {
}

params.cpuparams_batch.n_threads = params.draft_cpuparams_batch.n_threads;
- llama_init_result llama_init_dft = llama_init_from_gpt_params(params);
+ common_init_result llama_init_dft = common_init_from_params(params);
model_dft = llama_init_dft.model;
ctx_dft = llama_init_dft.context;

@@ -124,8 +124,8 @@ int main(int argc, char ** argv) {
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
LOG_ERR("%s: draft model vocab must match target model to use speculation but ", __func__);
LOG_ERR("token %d content differs - target '%s', draft '%s'\n", i,
- llama_token_to_piece(ctx_tgt, i).c_str(),
+ common_token_to_piece(ctx_tgt, i).c_str(),
- llama_token_to_piece(ctx_dft, i).c_str());
+ common_token_to_piece(ctx_dft, i).c_str());
return 1;
}
}

@@ -134,7 +134,7 @@ int main(int argc, char ** argv) {

// Tokenize the prompt
std::vector<llama_token> inp;
- inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true);
+ inp = common_tokenize(ctx_tgt, params.prompt, true, true);

const int max_context_size = llama_n_ctx(ctx_tgt);
const int max_tokens_list_size = max_context_size - 4;

@@ -147,7 +147,7 @@ int main(int argc, char ** argv) {
LOG("\n\n");

for (auto id : inp) {
- LOG("%s", llama_token_to_piece(ctx_tgt, id).c_str());
+ LOG("%s", common_token_to_piece(ctx_tgt, id).c_str());
}

const int n_input = inp.size();

@@ -178,7 +178,7 @@ int main(int argc, char ** argv) {
bool has_eos = false;

// target model sampling context (reuse the llama_context's sampling instance)
- struct gpt_sampler * smpl = gpt_sampler_init(model_tgt, params.sparams);
+ struct common_sampler * smpl = common_sampler_init(model_tgt, params.sparams);

struct llama_sampler * softmax = llama_sampler_init_softmax();

@@ -186,8 +186,8 @@ int main(int argc, char ** argv) {
std::vector<seq_draft> drafts(n_seq_dft);

for (int s = 0; s < n_seq_dft; ++s) {
- // allocate gpt_sampler for each draft sequence
+ // allocate llama_sampler for each draft sequence
- drafts[s].smpl = gpt_sampler_init(model_dft, params.sparams);
+ drafts[s].smpl = common_sampler_init(model_dft, params.sparams);
}

llama_batch batch_dft = llama_batch_init(params.n_ctx, 0, 1);

@@ -229,9 +229,9 @@ int main(int argc, char ** argv) {
bool accept = false;
if (params.sparams.temp > 0) {
// stochastic verification
- gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);
+ common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft], true);

- auto & dist_tgt = *gpt_sampler_get_candidates(smpl);
+ auto & dist_tgt = *common_sampler_get_candidates(smpl);

float p_tgt = 0.0f;
float p_dft = 0.0f;

@@ -277,13 +277,13 @@ int main(int argc, char ** argv) {
s_keep = s;
accept = true;
token_id = drafts[s].tokens[i_dft];
- token_str = llama_token_to_piece(ctx_tgt, token_id);
+ token_str = common_token_to_piece(ctx_tgt, token_id);
- gpt_sampler_accept(smpl, token_id, true);
+ common_sampler_accept(smpl, token_id, true);

LOG_DBG("draft token %d of sequence %d (%d, '%s') accepted\n", i_dft, s, token_id, token_str.c_str());
break;
} else {
- LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], llama_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
+ LOG_DBG("draft token %d of sequence %d (%d, '%s') rejected\n", i_dft, s, drafts[s].tokens[i_dft], common_token_to_piece(ctx_tgt, drafts[s].tokens[i_dft]).c_str());
drafts[s].active = false;

// calculate residual probability

@@ -349,19 +349,19 @@ int main(int argc, char ** argv) {
const int idx = dist(rng);

token_id = dist_tgt.data[idx].id;
- gpt_sampler_accept(smpl, token_id, true);
+ common_sampler_accept(smpl, token_id, true);
- token_str = llama_token_to_piece(ctx_tgt, token_id);
+ token_str = common_token_to_piece(ctx_tgt, token_id);
}
} else {
// greedy verification

// sample from the target model
LOG_DBG("sampling target: s_keep = %3d, i_dft = %3d, i_batch_tgt = %3d\n", s_keep, i_dft, drafts[s_keep].i_batch_tgt[i_dft]);
- token_id = gpt_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);
+ token_id = common_sampler_sample(smpl, ctx_tgt, drafts[s_keep].i_batch_tgt[i_dft]);

- gpt_sampler_accept(smpl, token_id, true);
+ common_sampler_accept(smpl, token_id, true);

- token_str = llama_token_to_piece(ctx_tgt, token_id);
+ token_str = common_token_to_piece(ctx_tgt, token_id);

for (int s = 0; s < n_seq_dft; ++s) {
if (!drafts[s].active) {

@@ -431,8 +431,8 @@ int main(int argc, char ** argv) {
drafts[0].dists.push_back(std::vector<llama_token_data>());
drafts[0].i_batch_tgt.push_back(0);

- llama_batch_clear(batch_dft);
+ common_batch_clear(batch_dft);
- llama_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);
+ common_batch_add (batch_dft, token_id, n_past_dft, { 0 }, true);

llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
// LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());

@@ -446,9 +446,9 @@ int main(int argc, char ** argv) {
}

if (drafts[0].smpl) {
- gpt_sampler_free(drafts[0].smpl);
+ common_sampler_free(drafts[0].smpl);
}
- drafts[0].smpl = gpt_sampler_clone(smpl);
+ drafts[0].smpl = common_sampler_clone(smpl);

int n_seq_cur = 1;
int n_past_cur = n_past_dft;

@@ -461,8 +461,8 @@ int main(int argc, char ** argv) {
drafts[0].drafting = true;
drafts[0].i_batch_dft = 0;

- llama_batch_clear(batch_tgt);
+ common_batch_clear(batch_tgt);
- llama_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);
+ common_batch_add (batch_tgt, drafts[0].tokens[0], n_past_tgt, { 0 }, true);

// sample n_draft tokens from the draft model using tree-based sampling
for (int i = 0; i < n_draft; ++i) {

@@ -477,13 +477,13 @@ int main(int argc, char ** argv) {
continue;
}

- gpt_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);
+ common_sampler_sample(drafts[s].smpl, ctx_dft, drafts[s].i_batch_dft, true);

- const auto * cur_p = gpt_sampler_get_candidates(drafts[s].smpl);
+ const auto * cur_p = common_sampler_get_candidates(drafts[s].smpl);

for (int k = 0; k < std::min(n_seq_dft + 3, (int) cur_p->size); ++k) {
LOG_DBG(" - draft candidate %3d for seq %3d, pos %3d: %6d (%8.3f) '%s'\n",
- k, s, i, cur_p->data[k].id, cur_p->data[k].p, llama_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
+ k, s, i, cur_p->data[k].id, cur_p->data[k].p, common_token_to_piece(ctx_dft, cur_p->data[k].id).c_str());
}

std::vector<int> sa(1, s);

@@ -518,9 +518,9 @@ int main(int argc, char ** argv) {
drafts[n_seq_cur].i_batch_tgt = drafts[s].i_batch_tgt;

if (drafts[n_seq_cur].smpl) {
- gpt_sampler_free(drafts[n_seq_cur].smpl);
+ common_sampler_free(drafts[n_seq_cur].smpl);
}
- drafts[n_seq_cur].smpl = gpt_sampler_clone(drafts[s].smpl);
+ drafts[n_seq_cur].smpl = common_sampler_clone(drafts[s].smpl);

sa.push_back(n_seq_cur);

@@ -536,7 +536,7 @@ int main(int argc, char ** argv) {

const int s = sa[is];

- gpt_sampler_accept(drafts[s].smpl, id, true);
+ common_sampler_accept(drafts[s].smpl, id, true);

drafts[s].tokens.push_back(id);
// save cur_p.data into drafts[s].dists

@@ -545,12 +545,12 @@ int main(int argc, char ** argv) {
// add unique drafted tokens to the target batch
drafts[s].i_batch_tgt.push_back(batch_tgt.n_tokens);

- llama_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);
+ common_batch_add(batch_tgt, id, n_past_tgt + i + 1, { s }, true);

// add the token to the batch for batched decoding with the draft model
drafts[s].i_batch_dft = batch_dft.n_tokens;

- llama_batch_add(batch_dft, id, n_past_cur, { s }, true);
+ common_batch_add(batch_dft, id, n_past_cur, { s }, true);

if (batch_tgt.n_tokens > n_draft) {
drafts[s].drafting = false;

@@ -617,11 +617,11 @@ int main(int argc, char ** argv) {

LOG_INF("\n");
LOG_INF("target:\n\n");
- gpt_perf_print(ctx_tgt, smpl);
+ common_perf_print(ctx_tgt, smpl);

- gpt_sampler_free(smpl);
+ common_sampler_free(smpl);
for (int s = 0; s < n_seq_dft; ++s) {
- gpt_sampler_free(drafts[s].smpl);
+ common_sampler_free(drafts[s].smpl);
}

llama_sampler_free(softmax);
@@ -365,7 +365,7 @@ int main(int raw_argc, char ** raw_argv) {
const bool parse_special = !no_parse_special;

std::vector<llama_token> tokens;
- tokens = ::llama_tokenize(model, prompt, add_bos, parse_special);
+ tokens = common_tokenize(model, prompt, add_bos, parse_special);

if (printing_ids) {
printf("[");

@@ -380,7 +380,7 @@ int main(int raw_argc, char ** raw_argv) {
} else {
bool invalid_utf8 = false;
printf("%6d -> '", tokens[i]);
- write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
+ write_utf8_cstr_to_stdout(common_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8);
if (invalid_utf8) {
printf("' (utf-8 decode failure)\n");
} else {

20 flake.lock generated
@@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
- "lastModified": 1726153070,
+ "lastModified": 1727826117,
- "narHash": "sha256-HO4zgY0ekfwO5bX0QH/3kJ/h4KvUDFZg8YpkNwIbg1U=",
+ "narHash": "sha256-K5ZLCyfO/Zj9mPFldf3iwS6oZStJcU4tSpiXTMYaaL0=",
"owner": "hercules-ci",
"repo": "flake-parts",
- "rev": "bcef6817a8b2aa20a5a6dbb19b43e63c5bf8619a",
+ "rev": "3d04084d54bedc3d6b8b736c70ef449225c361b1",
"type": "github"
},
"original": {

@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1727348695,
+ "lastModified": 1728018373,
- "narHash": "sha256-J+PeFKSDV+pHL7ukkfpVzCOO7mBSrrpJ3svwBFABbhI=",
+ "narHash": "sha256-NOiTvBbRLIOe5F6RbHaAh6++BNjsb149fGZd1T4+KBg=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "1925c603f17fc89f4c8f6bf6f631a802ad85d784",
+ "rev": "bc947f541ae55e999ffdb4013441347d83b00feb",
"type": "github"
},
"original": {

@@ -36,14 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
- "lastModified": 1725233747,
+ "lastModified": 1727825735,
- "narHash": "sha256-Ss8QWLXdr2JCBPcYChJhz4xJm+h/xjl4G0c0XlP6a74=",
+ "narHash": "sha256-0xHYkMkeLVQAMa7gvkddbPqpxph+hDzdu1XdGPJR+Os=",
"type": "tarball",
- "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+ "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
},
"original": {
"type": "tarball",
- "url": "https://github.com/NixOS/nixpkgs/archive/356624c12086a18f2ea2825fed34523d60ccc4e3.tar.gz"
+ "url": "https://github.com/NixOS/nixpkgs/archive/fb192fec7cc7a4c26d51779e9bab07ce6fa5597a.tar.gz"
}
},
"root": {
@@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
- ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
+ ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
@@ -12,20 +12,26 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
+ typedef struct ggml_backend_reg * ggml_backend_reg_t;
+ typedef struct ggml_backend_device * ggml_backend_dev_t;

+ //
+ // Backend buffer type
+ //
+
+ GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
+ GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
+ GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+ GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
+ GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);

//
// Backend buffer
//

- // buffer type
- GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
- GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
- GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
- GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);

- // buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,

@@ -36,7 +42,7 @@ extern "C" {
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
- GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);

@@ -47,8 +53,11 @@ extern "C" {
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);

+ // tensor copy between different backends
+ GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

//
- // Backend
+ // Backend (stream)
//

GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);

@@ -64,9 +73,9 @@ extern "C" {
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);

// "offset" refers to the offset of the tensor data for setting/getting data
- GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
- GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
- GGML_API GGML_CALL void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);

GGML_API void ggml_backend_synchronize(ggml_backend_t backend);

@@ -76,65 +85,121 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);

+ // NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);

- // tensor copy between different backends
- GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);

// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);

- // events
+ GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
- GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
+
+ //
+ // Events
+ //
+
+ GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
- GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
+ GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
|
//
|
||||||
// CPU backend
|
// Backend device
|
||||||
//
|
//
|
||||||
|
|
||||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
enum ggml_backend_dev_type {
|
||||||
|
GGML_BACKEND_DEVICE_TYPE_CPU,
|
||||||
|
GGML_BACKEND_DEVICE_TYPE_GPU,
|
||||||
|
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
|
||||||
|
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
|
||||||
|
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
|
||||||
|
};
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
|
// functionality supported by the device
|
||||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
struct ggml_backend_dev_caps {
|
||||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
// asynchronous operations
|
||||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
bool async;
|
||||||
|
// pinned host buffer
|
||||||
|
bool host_buffer;
|
||||||
|
// creating buffers from host ptr
|
||||||
|
bool buffer_from_host_ptr;
|
||||||
|
// event synchronization
|
||||||
|
bool events;
|
||||||
|
};
|
||||||
|
|
||||||
// Create a backend buffer from an existing pointer
|
// all the device properties
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
|
struct ggml_backend_dev_props {
|
||||||
|
const char * name;
|
||||||
|
const char * description;
|
||||||
|
size_t memory_free;
|
||||||
|
size_t memory_total;
|
||||||
|
enum ggml_backend_dev_type type;
|
||||||
|
struct ggml_backend_dev_caps caps;
|
||||||
|
};
|
||||||
|
|
||||||
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
|
||||||
|
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
|
||||||
|
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
|
||||||
|
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
|
||||||
|
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
|
||||||
|
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
|
||||||
|
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
|
||||||
|
|
||||||
#ifdef GGML_USE_CPU_HBM
|
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
|
||||||
#endif
|
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
|
||||||
|
|
||||||
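As a small illustration of how the new device structs above might be consulted, the following hypothetical helper queries a `ggml_backend_dev_t` (obtained elsewhere, for example through the enumeration functions declared further down in this header) and prints its properties. It only uses the declarations shown in this hunk.

    #include "ggml-backend.h"

    #include <cstdio>

    // a minimal sketch, not part of the change itself
    static void print_device_info(ggml_backend_dev_t dev) {
        struct ggml_backend_dev_props props;
        ggml_backend_dev_get_props(dev, &props);

        printf("%s: %s\n", props.name, props.description);
        printf("  memory: %zu free / %zu total bytes\n", props.memory_free, props.memory_total);
        printf("  async: %d, host buffer: %d, buffer from host ptr: %d, events: %d\n",
               props.caps.async, props.caps.host_buffer, props.caps.buffer_from_host_ptr, props.caps.events);
    }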
+ //
+ // Backend (reg)
+ //
+
+ GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
+ GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
+ GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
+ GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);

+ // Functions that may be obtained using ggml_backend_reg_get_proc_address
+ typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
+ typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);

//
// Backend registry
//

- // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+ // Backend (reg) enumeration
+ GGML_API size_t ggml_backend_reg_count(void);
+ GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
+ GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);

- GGML_API size_t ggml_backend_reg_get_count(void);
+ // Device enumeration
- GGML_API size_t ggml_backend_reg_find_by_name(const char * name); // returns index of backend with name, or SIZE_MAX if not found
+ GGML_API size_t ggml_backend_dev_count(void);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
+ GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
- GGML_API const char * ggml_backend_reg_get_name(size_t i);
+ GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+ GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
- GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
- GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+
+ // Direct backend (stream) initialization
+ // = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
+ GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
+ GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+ GGML_API ggml_backend_t ggml_backend_init_best(void);
//
|
//
|
||||||
// Backend scheduler
|
// Backend scheduler
|
||||||
//
|
//
|
||||||
|
|
||||||
// The backend scheduler allows for multiple backends to be used together
|
// The backend scheduler allows for multiple backend devices to be used together
|
||||||
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
|
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
|
||||||
// The backends are selected based on:
|
// The backends are selected based on:
|
||||||
// - the backend that supports the operation
|
// - the backend that supports the operation
|
||||||
|
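Note: a sketch of how the enumeration and direct-initialization entry points above fit together (the selection policy and print format are assumptions for illustration):

    #include <stdio.h>
    #include "ggml-backend.h"

    static ggml_backend_t pick_backend(void) {
        // list every device known to the registry
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            printf("device %zu: %s\n", i, ggml_backend_dev_description(dev));
        }
        // shorthand for ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) or (CPU_FULL), NULL)
        return ggml_backend_init_best();
    }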
@@ -169,9 +234,9 @@ extern "C" {
  }
  */

  struct ggml_backend_sched;
  typedef struct ggml_backend_sched * ggml_backend_sched_t;

  // Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
  // when ask == true, the scheduler wants to know if the user wants to observe this node
  // this allows the scheduler to batch nodes together in order to evaluate them in a single call
  //
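Note: the comment above describes the scheduler's observer mechanism. A hedged sketch of such a callback; the ggml_backend_sched_eval_callback typedef itself is outside this excerpt, so its exact signature is assumed here:

    #include <stdio.h>
    #include "ggml-backend.h"

    // assumed signature: bool (*cb)(struct ggml_tensor * t, bool ask, void * user_data)
    static bool observe_mul_mat(struct ggml_tensor * t, bool ask, void * user_data) {
        (void) user_data;
        if (ask) {
            // tell the scheduler which nodes we want to see, so it can split batches at those points
            return t->op == GGML_OP_MUL_MAT;
        }
        // called again once the observed node has been computed
        fprintf(stderr, "computed %s\n", t->name);
        return true; // assumption: returning true means "continue evaluating"
    }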
@@ -185,7 +250,7 @@ extern "C" {
  GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

  // Initialize backend buffers from a measure graph
- GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success

  GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
  GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
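Note: for context, a sketch of how reserve is typically used with a worst-case measure graph; the ggml_backend_sched_new signature is not part of this hunk and is assumed, and the backend handles are passed in by the caller:

    #include <stdio.h>
    #include "ggml-backend.h"

    // assumed: ggml_backend_sched_new(backends, bufts, n_backends, graph_size, parallel)
    static ggml_backend_sched_t make_sched(ggml_backend_t backend_gpu, ggml_backend_t backend_cpu,
                                           struct ggml_cgraph * worst_case_graph) {
        ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
        ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE, false);

        // reserve compute buffers once, using the largest graph the application will ever submit
        if (!ggml_backend_sched_reserve(sched, worst_case_graph)) {
            fprintf(stderr, "ggml_backend_sched_reserve failed\n");
        }
        return sched;
    }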
@@ -200,7 +265,7 @@ extern "C" {
  GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

  // Allocate and compute graph on the backend scheduler
- GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
  GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
  GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
  GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
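Note: a minimal per-iteration compute loop over the functions above (sketch; scheduler and graph construction omitted):

    #include "ggml-backend.h"

    static enum ggml_status compute_one(ggml_backend_sched_t sched, struct ggml_cgraph * gf) {
        // allocation may also happen implicitly inside graph_compute; done explicitly here for clarity
        if (!ggml_backend_sched_alloc_graph(sched, gf)) {
            return GGML_STATUS_ALLOC_FAILED;
        }
        enum ggml_status st = ggml_backend_sched_graph_compute_async(sched, gf);
        ggml_backend_sched_synchronize(sched); // wait for any asynchronous backends to finish
        return st;
    }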
@@ -226,7 +291,7 @@ extern "C" {
  GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
  GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);

- typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+ typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);

  // Compare the output of two backends
  GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
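Note: ggml_backend_compare_graph_backend runs the same graph on two backends and hands each pair of corresponding tensors to the callback typedef'd above. A sketch of a callback that only reports which node is being compared (the numerical comparison itself is left out):

    #include <stdio.h>
    #include "ggml-backend.h"

    static bool report_node(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
        (void) user_data;
        fprintf(stderr, "node %d: %s vs %s\n", node_index, t1->name, t2->name);
        return true; // keep comparing the remaining nodes
    }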
@@ -235,6 +300,26 @@ extern "C" {
  GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
  GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);

+ //
+ // CPU backend
+ //

+ GGML_API ggml_backend_t ggml_backend_cpu_init(void);

+ GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
+ GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
+ GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
+ GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

+ // Create a backend buffer from an existing pointer
+ GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);

+ GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);

+ #ifdef GGML_USE_CPU_HBM
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+ #endif

  #ifdef __cplusplus
  }
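Note: the CPU backend block consolidated above is used like any other backend. A short sketch (the thread count is an arbitrary example; graph construction is assumed to have happened on compatible buffers):

    #include "ggml-backend.h"

    static enum ggml_status run_on_cpu(struct ggml_cgraph * gf, int n_threads) {
        ggml_backend_t cpu = ggml_backend_cpu_init();
        ggml_backend_cpu_set_n_threads(cpu, n_threads);
        enum ggml_status st = ggml_backend_graph_compute(cpu, gf); // declared elsewhere in this header
        ggml_backend_free(cpu);                                    // declared elsewhere in this header
        return st;
    }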
@@ -9,13 +9,15 @@ extern "C" {
  #endif

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
+ GGML_API ggml_backend_t ggml_backend_blas_init(void);

- GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);

  // number of threads used for conversion to float
  // for openblas and blis, this will also set the number of threads used for blas operations
- GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
+ GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);

+ GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);

  #ifdef __cplusplus
@@ -44,7 +44,7 @@ extern "C" {
  * @param device The index of the device to initialize.
  * @return A pointer to the initialized backend instance, or nullptr on failure.
  */
- GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
+ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);

  /**
  * @brief Checks if a given backend is a CANN backend.

@@ -55,7 +55,7 @@ GGML_API GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device);
  * @param backend The backend instance to check.
  * @return True if the backend is a CANN backend, false otherwise.
  */
- GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);

  /**
  * @brief Retrieves the CANN buffer type for a specified device.

@@ -67,7 +67,7 @@ GGML_API GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend);
  * @return A pointer to the buffer type interface for the specified device, or
  *         nullptr if the device index is out of range.
  */
- GGML_API GGML_CALL ggml_backend_buffer_type_t
+ GGML_API ggml_backend_buffer_type_t
  ggml_backend_cann_buffer_type(int32_t device);

  /**

@@ -78,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
  *
  * @return The number of CANN devices available.
  */
- GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
+ GGML_API int32_t ggml_backend_cann_get_device_count(void);

  /**
  * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
  *
  * @return A pointer to the host buffer type interface.
  */
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);

  /**
  * @brief Retrieves the description of a specific CANN device.

@@ -97,7 +97,7 @@ GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type
  * @param description Pointer to a buffer where the description will be written.
  * @param description_size Size of the description buffer.
  */
- GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
+ GGML_API void ggml_backend_cann_get_device_description(
  int32_t device, char* description, size_t description_size);

  /**

@@ -112,21 +112,10 @@ GGML_API GGML_CALL void ggml_backend_cann_get_device_description(
  * @param total Pointer to a variable where the total memory size will be
  *              stored.
  */
- GGML_API GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device,
+ GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
  size_t* free,
  size_t* total);

- /**
-  * @brief Set the logging callback for GGML.
-  *
-  * This function sets the logging callback and user data for logging.
-  *
-  * @param log_callback The logging callback to set.
-  * @param user_data User data to pass to the logging callback.
-  */
- GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
- void* user_data);

  #ifdef __cplusplus
  }
  #endif
@@ -3,6 +3,10 @@
  #include "ggml.h"
  #include "ggml-backend.h"

+ #ifdef __cplusplus
+ extern "C" {
+ #endif

  #ifdef GGML_USE_HIPBLAS
  #define GGML_CUDA_NAME "ROCm"
  #define GGML_CUBLAS_NAME "hipBLAS"

@@ -13,35 +17,31 @@
  #define GGML_CUDA_NAME "CUDA"
  #define GGML_CUBLAS_NAME "cuBLAS"
  #endif

- #ifdef __cplusplus
- extern "C" {
- #endif

  #define GGML_CUDA_MAX_DEVICES 16

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
+ GGML_API ggml_backend_t ggml_backend_cuda_init(int device);

- GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);

  // device buffer
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);

  // split tensor buffer that splits matrices by rows across multiple devices
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);

  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);

- GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
+ GGML_API int ggml_backend_cuda_get_device_count(void);
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
+ GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
- GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
+ GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);

- GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
+ GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
- GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
+ GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);

+ GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);

- GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);

  #ifdef __cplusplus
  }
  #endif
@@ -1,3 +1,5 @@
+ // Note: this description is outdated
+ //
  // An interface allowing to compute ggml_cgraph with Metal
  //
  // This is a fully functional interface that extends ggml with GPU support for Apple devices.

@@ -37,17 +39,17 @@ extern "C" {
  // user-code should use only these functions
  //

- GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);

  GGML_API ggml_backend_t ggml_backend_metal_init(void);

  GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);

- GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
+ GGML_DEPRECATED(
+ GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
+ "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");

  GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);

  // helper to check if the device supports a specific family
  // ideally, the user code should be doing these checks

@@ -57,6 +59,8 @@ GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int fam
  // capture all command buffers committed the next time `ggml_backend_graph_compute` is called
  GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);

+ GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);

  #ifdef __cplusplus
  }
  #endif
@@ -10,14 +10,18 @@ extern "C" {
  #define GGML_RPC_MAX_SERVERS 16

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
+ GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
- GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);

- GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
+ GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);

- GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
+ GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);

+ GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);

+ GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);

  #ifdef __cplusplus
  }
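Note: besides the GGML_CALL removal, start_rpc_server is renamed to the properly prefixed ggml_backend_rpc_start_server, and reg/device entry points are added. A hedged sketch of the two sides (the header name, endpoint strings, and memory values are placeholder assumptions):

    #include "ggml-backend.h"
    #include "ggml-rpc.h"   // assumed header name for the declarations above

    // server side: expose a local backend over the network
    static void serve(ggml_backend_t local_backend, size_t free_mem, size_t total_mem) {
        ggml_backend_rpc_start_server(local_backend, "0.0.0.0:50052", free_mem, total_mem);
    }

    // client side: either create the remote backend directly, or register it as a device
    static ggml_backend_t connect_remote(void) {
        ggml_backend_dev_t dev = ggml_backend_rpc_add_device("192.168.1.10:50052"); // placeholder endpoint
        (void) dev; // the device can now also be used through the generic ggml_backend_dev_* API
        return ggml_backend_rpc_init("192.168.1.10:50052");
    }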
@@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
  GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);

  // split tensor buffer that splits matrices by rows across multiple devices
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);

  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
  GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);

  GGML_API void ggml_backend_sycl_print_sycl_devices(void);
- GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
+ GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
- GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
+ GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
- GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
+ GGML_API int ggml_backend_sycl_get_device_count();
- GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
+ GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);

  // SYCL doesn't support registering host memory, keep here for reference
- // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
+ // GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
- // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
+ // GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);

  #ifdef __cplusplus
  }
  #endif
@@ -13,16 +13,16 @@ extern "C" {
  GGML_API void ggml_vk_instance_init(void);

  // backend API
- GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
+ GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);

- GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
+ GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
- GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
+ GGML_API int ggml_backend_vk_get_device_count(void);
- GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
+ GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
- GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
+ GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);

- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
  // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
- GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
+ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);

  #ifdef __cplusplus
  }
@@ -187,16 +187,6 @@
  # define GGML_API
  #endif

- #ifdef GGML_MULTIPLATFORM
- # if defined(_WIN32)
- # define GGML_CALL
- # else
- # define GGML_CALL __attribute__((__ms_abi__))
- # endif
- #else
- # define GGML_CALL
- #endif

  // TODO: support for clang
  #ifdef __GNUC__
  # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@@ -340,7 +330,7 @@ extern "C" {
  };

  // get ggml_status name string
- GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
+ GGML_API const char * ggml_status_to_string(enum ggml_status status);

  // ieee 754-2008 half-precision float16
  // todo: make this not an integral type
|
@ -466,6 +456,7 @@ extern "C" {
|
||||||
GGML_OP_SUM_ROWS,
|
GGML_OP_SUM_ROWS,
|
||||||
GGML_OP_MEAN,
|
GGML_OP_MEAN,
|
||||||
GGML_OP_ARGMAX,
|
GGML_OP_ARGMAX,
|
||||||
|
GGML_OP_COUNT_EQUAL,
|
||||||
GGML_OP_REPEAT,
|
GGML_OP_REPEAT,
|
||||||
GGML_OP_REPEAT_BACK,
|
GGML_OP_REPEAT_BACK,
|
||||||
GGML_OP_CONCAT,
|
GGML_OP_CONCAT,
|
||||||
|
@ -716,46 +707,46 @@ extern "C" {
|
||||||
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
GGML_API void ggml_print_object (const struct ggml_object * obj);
|
||||||
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
|
||||||
|
|
||||||
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
|
||||||
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
|
||||||
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
||||||
|
|
||||||
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
|
GGML_API int64_t ggml_blck_size(enum ggml_type type);
|
||||||
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
||||||
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
||||||
|
|
||||||
GGML_DEPRECATED(
|
GGML_DEPRECATED(
|
||||||
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
||||||
"use ggml_row_size() instead");
|
"use ggml_row_size() instead");
|
||||||
|
|
||||||
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
|
GGML_API const char * ggml_type_name(enum ggml_type type);
|
||||||
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
|
GGML_API const char * ggml_op_name (enum ggml_op op);
|
||||||
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
GGML_API const char * ggml_op_symbol(enum ggml_op op);
|
||||||
|
|
||||||
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
|
||||||
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
|
||||||
|
|
||||||
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
|
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
|
GGML_API bool ggml_is_quantized(enum ggml_type type);
|
||||||
|
|
||||||
// TODO: temporary until model loading of ggml examples is refactored
|
// TODO: temporary until model loading of ggml examples is refactored
|
||||||
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
|
||||||
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
|
||||||
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
|
||||||
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
|
||||||
|
|
||||||
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
|
||||||
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
|
||||||
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
|
||||||
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
|
||||||
|
|
||||||
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||||
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
|
||||||
|
@@ -847,7 +838,7 @@ extern "C" {
  GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
  GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);

- GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+ GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);

  GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
  GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@@ -1004,6 +995,12 @@ extern "C" {
  struct ggml_context * ctx,
  struct ggml_tensor * a);

+ // count number of equal elements in a and b
+ GGML_API struct ggml_tensor * ggml_count_equal(
+ struct ggml_context * ctx,
+ struct ggml_tensor * a,
+ struct ggml_tensor * b);

  // if a is the same shape as b, and a is not parameter, return a
  // otherwise, return a new tensor: repeat(a) to fit in b
  GGML_API struct ggml_tensor * ggml_repeat(
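Note: a sketch of how the new ggml_count_equal op (paired with GGML_OP_COUNT_EQUAL above) might be used; the result is a scalar tensor whose value becomes available after the graph is computed, which is an assumption based on the op's purpose rather than on this excerpt:

    #include "ggml.h"

    // e.g. a = predicted class ids, b = reference class ids, both built in the same context
    static struct ggml_tensor * build_count_equal(struct ggml_context * ctx,
                                                  struct ggml_tensor * a,
                                                  struct ggml_tensor * b) {
        struct ggml_tensor * n_correct = ggml_count_equal(ctx, a, b);
        // add n_correct to a cgraph, compute it on some backend, then read the scalar back
        return n_correct;
    }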
@@ -1561,7 +1558,7 @@ extern "C" {
  "use ggml_rope_ext_inplace instead");

  // compute correction dims for YaRN RoPE scaling
- GGML_CALL void ggml_rope_yarn_corr_dims(
+ void ggml_rope_yarn_corr_dims(
  int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);

  // rotary position embedding backward, i.e compute dx from dy
@@ -2179,6 +2176,10 @@ extern "C" {
  typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
  typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);

+ // Set callback for all future logging events.
+ // If this is not called, or NULL is supplied, everything is output on stderr.
+ GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);

  // optimization parameters
  //
  // see ggml.c (ggml_opt_default_params) for default values
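Note: with the new ggml_log_set, an application can route all ggml logging through one callback instead of stderr. A small sketch (the level filter is an arbitrary choice, not part of the API):

    #include <stdio.h>
    #include "ggml.h"

    static void my_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_DEBUG) {
            return; // drop debug output, forward everything else unchanged
        }
        fputs(text, stderr);
    }

    // during startup:
    //     ggml_log_set(my_log_cb, NULL);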
@@ -2536,7 +2537,7 @@ extern "C" {
  typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
  const void * GGML_RESTRICT y, int nr, int nc);

- typedef struct {
+ struct ggml_type_traits {
  const char * type_name;
  int64_t blck_size;
  int64_t blck_size_interleave; // interleave elements in blocks

@@ -2552,9 +2553,9 @@ extern "C" {
  int64_t ncols; // number of columns to process simultaneously
  ggml_gemv_t gemv;
  ggml_gemm_t gemm;
- } ggml_type_traits_t;
+ };

- GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+ GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);

  #ifdef __cplusplus
  }
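Note: the type-traits table is now reached through ggml_get_type_traits and a named struct instead of the old internal typedef. A short usage sketch using only fields visible in this hunk:

    #include <stdio.h>
    #include "ggml.h"

    static void print_type_info(enum ggml_type type) {
        const struct ggml_type_traits * tt = ggml_get_type_traits(type);
        printf("%s: block size = %lld\n", tt->type_name, (long long) tt->blck_size);
    }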
@@ -163,8 +163,8 @@ if (GGML_OPENMP)
  list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)

  if (GGML_MUSA)
- list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
+ list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
- list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
+ list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
  endif()
  else()
  message(WARNING "OpenMP not found")

@@ -190,22 +190,24 @@ if (GGML_BLAS)
  # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
  find_package(PkgConfig REQUIRED)
  if (${GGML_BLAS_VENDOR} MATCHES "Generic")
- pkg_check_modules(DepBLAS REQUIRED blas)
+ pkg_check_modules(DepBLAS blas)
  elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
  # As of openblas v0.3.22, the 64-bit is named openblas64.pc
  pkg_check_modules(DepBLAS openblas64)
  if (NOT DepBLAS_FOUND)
- pkg_check_modules(DepBLAS REQUIRED openblas)
+ pkg_check_modules(DepBLAS openblas)
  endif()
  elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
- pkg_check_modules(DepBLAS REQUIRED blis)
+ add_compile_definitions(GGML_BLAS_USE_BLIS)
+ pkg_check_modules(DepBLAS blis)
  elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
- pkg_check_modules(DepBLAS REQUIRED blas-atlas)
+ pkg_check_modules(DepBLAS blas-atlas)
  elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
- pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
+ pkg_check_modules(DepBLAS flexiblas_api)
  elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+ add_compile_definitions(GGML_BLAS_USE_MKL)
  # all Intel* libraries share the same include path
- pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
+ pkg_check_modules(DepBLAS mkl-sdl)
  elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
  # this doesn't provide pkg-config
  # suggest to assign BLAS_INCLUDE_DIRS on your own
@@ -511,8 +513,8 @@ if (GGML_HIPBLAS)
  endif()

  if (GGML_SYCL)
- if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
+ if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
- message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
+ message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
  endif()

  check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)

@@ -532,6 +534,9 @@ if (GGML_SYCL)
  list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)

  if (GGML_SYCL_F16)
+ if (GGML_SYCL_TARGET STREQUAL "AMD")
+ message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
+ endif()
  add_compile_definitions(GGML_SYCL_F16)
  endif()

@@ -543,6 +548,12 @@ if (GGML_SYCL)
  if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
  add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+ # INFO: Allowed Sub_group_sizes are not consistent through all
+ # hip targets. For example, 64 is used for certain models, but the backend
+ # does not support it.
+ # Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
+ add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
  else()
  add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
  endif()
@@ -576,6 +587,12 @@ if (GGML_SYCL)
  elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
  list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
+ elseif (GGML_SYCL_TARGET STREQUAL "AMD")
+ if (GGML_SYCL_HIP_TARGET STREQUAL "")
+ message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
+ endif()
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
+ list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
  endif()
  endif()
  endif()
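Note: for reference, a hypothetical configure line for the new AMD SYCL target; the compiler choice and the gfx1030 offload arch are assumptions, while GGML_SYCL_HIP_TARGET must be set, as the check above enforces:

    # assumes a oneAPI compiler (icpx) with an AMD offload plugin installed
    cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_HIP_TARGET=gfx1030 \
          -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx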
@@ -1310,7 +1327,7 @@ add_library(ggml
  ../include/ggml-backend.h
  ggml.c
  ggml-alloc.c
- ggml-backend.c
+ ggml-backend.cpp
  ggml-quants.c
  ggml-quants.h
  ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}

@@ -1346,6 +1363,10 @@ if (MATH_LIBRARY)
  endif()
  endif()

+ if (CMAKE_SYSTEM_NAME MATCHES "Android")
+ list(APPEND GGML_EXTRA_LIBS_PRIVATE dl) # Must be linked explicitly
+ endif()

  list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
  list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
  target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
@@ -14,7 +14,7 @@

  //#define GGML_ALLOCATOR_DEBUG

- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+ //#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
  #define AT_PRINTF(...)

@@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
  size = GGML_PAD(size, talloc->alignment);

  if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
  __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
  GGML_ABORT("not enough space in the buffer");
  }

@@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
  // this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
  __func__, size, max_avail);
  GGML_ABORT("not enough space in the buffer");
  }

@@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
  }
  }
  }
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
  alloc->allocated_tensors[i].offset,
  alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
  ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
- fprintf(stderr, "\n");
+ GGML_LOG_DEBUG("\n");
  }
  #endif
@@ -768,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif

  ggml_backend_buffer_free(galloc->buffers[i]);
  galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
  if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
  return false;
  }
  ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);

@@ -825,14 +825,14 @@ static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_t
  static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
  if (galloc->n_nodes != graph->n_nodes) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
  #endif
  return true;
  }

  if (galloc->n_leafs != graph->n_leafs) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
  #endif
  return true;
  }

@@ -843,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph

  if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
  #endif
  return true;
  }

@@ -855,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
  }
  if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
  #endif
  return true;
  }
@@ -869,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
  if (ggml_gallocr_needs_realloc(galloc, graph)) {
  if (galloc->n_buffers == 1) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
  #endif
  if (!ggml_gallocr_reserve(galloc, graph)) {
  return false;
  }
  } else {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
  #endif
  return false;
  }

@@ -940,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
  ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
+ GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
  #endif
  for (size_t i = 0; i < *n_buffers; i++) {
  ggml_backend_buffer_free((*buffers)[i]);

@@ -990,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
  __func__, t->name,
  ggml_backend_buft_name(buft),
  this_size, max_size);

@@ -1022,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte

  if (n_buffers == 0) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
  #endif
  return NULL;
  }
@@ -9,145 +9,218 @@ extern "C" {
 #endif

     //
-    // Backend buffer
+    // Backend buffer type
     //

-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
     struct ggml_backend_buffer_type_i {
-        const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+        const char * (*get_name) (ggml_backend_buffer_type_t buft);
         // allocate a buffer of this type
-        ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
+        ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
         // tensor alignment
-        size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+        size_t (*get_alignment) (ggml_backend_buffer_type_t buft);
-        // max buffer size that can be allocated
-        size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
-        // data size needed to allocate the tensor, including padding
-        size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
-        // check if tensor data is in host memory
-        bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
+        // (optional) max buffer size that can be allocated (defaults to SIZE_MAX)
+        size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
+        // (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
+        size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
+        // (optional) check if tensor data is in host memory (defaults to false)
+        bool (*is_host) (ggml_backend_buffer_type_t buft);
     };

     struct ggml_backend_buffer_type {
         struct ggml_backend_buffer_type_i iface;
-        ggml_backend_buffer_type_context_t context;
+        ggml_backend_dev_t device;
+        void * context;
     };

-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
+    //
+    // Backend buffer
+    //

     struct ggml_backend_buffer_i {
-        const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
-        void (*GGML_CALL free_buffer) (ggml_backend_buffer_t buffer);
-        void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
-        void (*GGML_CALL init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void (*GGML_CALL memset_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
-        void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
-        void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
-        void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        const char * (*get_name) (ggml_backend_buffer_t buffer);
+        // (optional) free the buffer
+        void (*free_buffer) (ggml_backend_buffer_t buffer);
+        // base address of the buffer
+        void * (*get_base) (ggml_backend_buffer_t buffer);
+        // (optional) initialize a tensor in the buffer (eg. add tensor extras)
+        void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        // tensor data access
+        void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
+        void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        // (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
+        bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        // clear the entire buffer
+        void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
+        // (optional) reset any internal state due to tensor initialization, such as tensor extras
+        void (*reset) (ggml_backend_buffer_t buffer);
     };

     struct ggml_backend_buffer {
         struct ggml_backend_buffer_i iface;
         ggml_backend_buffer_type_t buft;
-        ggml_backend_buffer_context_t context;
+        void * context;
         size_t size;
         enum ggml_backend_buffer_usage usage;
     };

-    GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
+    ggml_backend_buffer_t ggml_backend_buffer_init(
         ggml_backend_buffer_type_t buft,
         struct ggml_backend_buffer_i iface,
-        ggml_backend_buffer_context_t context,
+        void * context,
         size_t size);

     // do not use directly, use ggml_backend_tensor_copy instead
     bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);

+    // multi-buffer
     // buffer that contains a collection of buffers
-    GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
-    GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
-    GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+    ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+    bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+    void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);

     //
-    // Backend
+    // Backend (stream)
     //

-    typedef void * ggml_backend_context_t;
-
     struct ggml_backend_i {
-        const char * (*GGML_CALL get_name)(ggml_backend_t backend);
+        const char * (*get_name)(ggml_backend_t backend);

-        void (*GGML_CALL free)(ggml_backend_t backend);
+        void (*free)(ggml_backend_t backend);

+        // Will be moved to the device interface
         // buffer allocation
-        ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);

         // (optional) asynchronous tensor data access
-        void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
-        bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+        bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);

         // (optional) complete all pending operations
-        void (*GGML_CALL synchronize)(ggml_backend_t backend);
+        void (*synchronize)(ggml_backend_t backend);

-        // compute graph with a plan (not used currently)
-        // create a new plan for a graph
-        ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
-        void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        // (optional) compute graph with a plan (not used currently)
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
+        void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
         // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
-        void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+        void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
         // compute the graph with the plan
-        enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

-        // compute graph without a plan (async)
-        enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        // compute graph (always async if supported by the backend)
+        enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);

-        // check if the backend can compute an operation
-        bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-
-        // check if the backend can use tensors allocated in a buffer type
-        bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
-
-        // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
-        // these should be expensive operations with large batch sizes that may benefit from running on this backend
-        // even if the weight has to be copied from the CPU temporarily
-        bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+        // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
+        // new backends should implement the device interface instead
+        // These functions are being moved to the device interface
+        bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
+        bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+        bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);

         // (optional) event synchronization
-        // create a new event that can record events on this backend instance
-        ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
-        void (*GGML_CALL event_free) (ggml_backend_event_t event);
-        // record an event on the backend instance that created it
-        void (*GGML_CALL event_record) (ggml_backend_event_t event);
-        // wait for an event on on a different backend instance
-        void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
-        // block until an event is recorded
-        void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
+        // record an event on this stream
+        void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
+        // wait for an event on on a different stream
+        void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
     };

     struct ggml_backend {
         ggml_guid_t guid;

         struct ggml_backend_i iface;
-        ggml_backend_context_t context;
+        ggml_backend_dev_t device;
+        void * context;
     };

     struct ggml_backend_event {
-        ggml_backend_t backend;
+        struct ggml_backend_device * device;
         void * context;
     };

     //
-    // Backend registry
+    // Backend device
     //

-    typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
-
-    GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+    // Note: if additional properties are needed, we should add a struct with all of them
+    // the current functions to obtain the properties can remain, since they are more convenient for often used properties
+    struct ggml_backend_device_i {
+        // device name: short identifier for this device, such as "CPU" or "CUDA0"
+        const char * (*get_name)(ggml_backend_dev_t dev);
+
+        // device description: short informative description of the device, could be the model name
+        const char * (*get_description)(ggml_backend_dev_t dev);
+
+        // device memory in bytes
+        void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
+
+        // device type
+        enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
+
+        // device properties
+        void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
+
+        // backend (stream) initialization
+        ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
+
+        // preferred buffer type
+        ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
+        ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
+
+        // (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
+        ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
+
+        // check if the backend can compute an operation
+        bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // check if the backend can use tensors allocated in a buffer type
+        bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
+
+        // (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
+        // these should be expensive operations that may benefit from running on this backend instead of the CPU backend
+        bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
+
+        // (optional) event synchronization
+        ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
+        void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+        void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
+    };
+
+    struct ggml_backend_device {
+        struct ggml_backend_device_i iface;
+        ggml_backend_reg_t reg;
+        void * context;
+    };
+
+    //
+    // Backend (reg)
+    //
+
+    struct ggml_backend_reg_i {
+        const char * (*get_name)(ggml_backend_reg_t reg);
+
+        // enumerate available devices
+        size_t (*get_device_count)(ggml_backend_reg_t reg);
+        ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
+
+        // (optional) get a pointer to a function in the backend
+        // backends can add custom functions that are not part of the standard ggml-backend interface
+        void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
+    };
+
+    struct ggml_backend_reg {
+        // int api_version; // TODO: for dynamic loading
+        struct ggml_backend_reg_i iface;
+        void * context;
+    };
+
+    // Internal backend registry API
+    void ggml_backend_register(ggml_backend_reg_t reg);
+    void ggml_backend_device_register(ggml_backend_dev_t device);
+    // TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
+    // typedef ggml_backend_register_t * (*ggml_backend_init)(void);

 #ifdef __cplusplus
 }
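
The restructured header above replaces the old name-based registration entry point with per-backend `ggml_backend_reg` and `ggml_backend_device` objects. The following is a minimal C++ sketch, not part of this commit, of how a hypothetical backend could be wired into these interfaces: every `example_*` symbol is invented for illustration, the required callbacks return placeholder values, and the members marked "(optional)" above are left NULL.

    // sketch only - hypothetical "EXAMPLE" backend, all example_* names are invented
    #include "ggml-backend-impl.h"
    #include <cstring>

    static const char * example_dev_get_name       (ggml_backend_dev_t dev) { GGML_UNUSED(dev); return "EXAMPLE0"; }
    static const char * example_dev_get_description(ggml_backend_dev_t dev) { GGML_UNUSED(dev); return "Example device"; }

    static void example_dev_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
        GGML_UNUSED(dev);
        *free = 0; *total = 0; // a real backend would query the device here
    }

    static enum ggml_backend_dev_type example_dev_get_type(ggml_backend_dev_t dev) {
        GGML_UNUSED(dev);
        return GGML_BACKEND_DEVICE_TYPE_CPU;
    }

    static void example_dev_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
        std::memset(props, 0, sizeof(*props)); // leave caps at their defaults in this sketch
        props->name        = example_dev_get_name(dev);
        props->description = example_dev_get_description(dev);
        props->type        = example_dev_get_type(dev);
        example_dev_get_memory(dev, &props->memory_free, &props->memory_total);
    }

    static ggml_backend_t example_dev_init_backend(ggml_backend_dev_t dev, const char * params) {
        GGML_UNUSED(dev); GGML_UNUSED(params);
        return NULL; // a real backend would create its ggml_backend (stream) here
    }

    static ggml_backend_buffer_type_t example_dev_get_buffer_type(ggml_backend_dev_t dev) {
        GGML_UNUSED(dev);
        return ggml_backend_cpu_buffer_type();
    }

    static bool example_dev_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
        GGML_UNUSED(dev);
        return op->op == GGML_OP_NONE; // accept nothing but no-ops in this sketch
    }

    static bool example_dev_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
        GGML_UNUSED(dev);
        return ggml_backend_buft_is_host(buft);
    }

    static const struct ggml_backend_device_i example_dev_iface = {
        /* .get_name             = */ example_dev_get_name,
        /* .get_description      = */ example_dev_get_description,
        /* .get_memory           = */ example_dev_get_memory,
        /* .get_type             = */ example_dev_get_type,
        /* .get_props            = */ example_dev_get_props,
        /* .init_backend         = */ example_dev_init_backend,
        /* .get_buffer_type      = */ example_dev_get_buffer_type,
        /* .get_host_buffer_type = */ NULL,
        /* .buffer_from_host_ptr = */ NULL,
        /* .supports_op          = */ example_dev_supports_op,
        /* .supports_buft        = */ example_dev_supports_buft,
        /* .offload_op           = */ NULL,
        /* .event_new            = */ NULL,
        /* .event_free           = */ NULL,
        /* .event_synchronize    = */ NULL,
    };

    static const char * example_reg_get_name        (ggml_backend_reg_t reg) { GGML_UNUSED(reg); return "EXAMPLE"; }
    static size_t       example_reg_get_device_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); return 1; }

    static ggml_backend_dev_t example_reg_get_device(ggml_backend_reg_t reg, size_t index) {
        GGML_ASSERT(index == 0);
        static ggml_backend_device example_device = {
            /* .iface   = */ example_dev_iface,
            /* .reg     = */ reg,
            /* .context = */ nullptr,
        };
        return &example_device;
    }

    static const struct ggml_backend_reg_i example_reg_iface = {
        /* .get_name         = */ example_reg_get_name,
        /* .get_device_count = */ example_reg_get_device_count,
        /* .get_device       = */ example_reg_get_device,
        /* .get_proc_address = */ NULL,
    };

    static struct ggml_backend_reg example_reg = {
        /* .iface   = */ example_reg_iface,
        /* .context = */ NULL,
    };

    // somewhere during startup: ggml_backend_register(&example_reg);

The BLAS backend changes later in this commit follow exactly this shape with real callbacks.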
File diff suppressed because it is too large.
@@ -4,6 +4,7 @@
 #include <future>
 #include <vector>
+#include <cstring>

 #if defined(GGML_USE_ACCELERATE)
 #  include <Accelerate/Accelerate.h>
@@ -26,30 +27,6 @@ struct ggml_backend_blas_context {
 #endif
 };

-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
-    const struct ggml_tensor * src0 = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) &&
-        ggml_is_contiguous(src1) &&
-        src1->type == GGML_TYPE_F32 &&
-        (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
-        /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
-        return true;
-    }
-
-    return false;
-}
-
 static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
     const struct ggml_tensor * src0 = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
@@ -88,8 +65,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
 
     // convert src0 to float
     if (type != GGML_TYPE_F32) {
-        ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
-        ggml_to_float_t const to_float = type_traits.to_float;
+        const auto * type_traits = ggml_get_type_traits(type);
+        ggml_to_float_t const to_float = type_traits->to_float;

         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -235,25 +212,25 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g

 // backend interface

-GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
     return "BLAS";

     GGML_UNUSED(backend);
 }

-GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
+static void ggml_backend_blas_free(ggml_backend_t backend) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
     delete ctx;
     delete backend;
 }

-GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
+static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_cpu_buffer_type();

     GGML_UNUSED(backend);
 }

-GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;

     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -285,29 +262,8 @@ GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t
     GGML_UNUSED(backend);
 }

-GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
-    const struct ggml_tensor * src0 = op->src[0];
-    const struct ggml_tensor * src1 = op->src[1];
-
-    return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
-           (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
-            op->src[1]->type == GGML_TYPE_F32 &&
-            ggml_is_matrix(src0) &&
-            ggml_is_matrix(src1) &&
-            ggml_is_contiguous(src0) &&
-            (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
-
-    GGML_UNUSED(backend);
-}
-
-GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
-
-    GGML_UNUSED(backend);
-}
-
 static struct ggml_backend_i blas_backend_i = {
-    /* .get_name = */ ggml_backend_blas_name,
+    /* .get_name = */ ggml_backend_blas_get_name,
     /* .free = */ ggml_backend_blas_free,
     /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
     /* .set_tensor_async = */ NULL,
@@ -319,14 +275,11 @@ static struct ggml_backend_i blas_backend_i = {
     /* .graph_plan_update = */ NULL,
     /* .graph_plan_compute = */ NULL,
     /* .graph_compute = */ ggml_backend_blas_graph_compute,
-    /* .supports_op = */ ggml_backend_blas_supports_op,
-    /* .supports_buft = */ ggml_backend_blas_supports_buft,
+    /* .supports_op = */ NULL,
+    /* .supports_buft = */ NULL,
     /* .offload_op = */ NULL,
-    /* .event_new = */ NULL,
-    /* .event_free = */ NULL,
     /* .event_record = */ NULL,
     /* .event_wait = */ NULL,
-    /* .event_synchronize = */ NULL,
 };

 static ggml_guid_t ggml_backend_blas_guid(void) {
@@ -340,23 +293,24 @@ ggml_backend_t ggml_backend_blas_init(void) {
     ggml_backend_t backend = new ggml_backend {
         /* .guid = */ ggml_backend_blas_guid(),
         /* .interface = */ blas_backend_i,
+        /* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
         /* .context = */ ctx,
     };

-#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
     if (openblas_get_parallel() != OPENBLAS_OPENMP) {
-        fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+        GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
     }
 #endif

-#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
-    fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+    GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
 #endif

     return backend;
 }

-GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
+bool ggml_backend_is_blas(ggml_backend_t backend) {
     return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
 }

@@ -366,3 +320,205 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
     ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
     ctx->n_threads = n_threads;
 }
+
+// device interface
+
+static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
+    return "BLAS";
+
+    GGML_UNUSED(dev);
+}
+
+static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
+#if defined(GGML_USE_ACCELERATE)
+    return "Accelerate";
+#elif defined(GGML_BLAS_USE_MKL)
+    return "MKL";
+#elif defined(GGML_BLAS_USE_BLIS)
+    return "BLIS";
+#elif defined(GGML_BLAS_USE_NVPL)
+    return "NVPL";
+#elif defined(OPENBLAS_VERSION)
+    return "OpenBLAS";
+#else
+    return "BLAS";
+#endif
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+    // TODO
+    *free = 0;
+    *total = 0;
+
+    GGML_UNUSED(dev);
+}
+
+static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
+    return GGML_BACKEND_DEVICE_TYPE_CPU;
+
+    GGML_UNUSED(dev);
+}
+
+static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name = ggml_backend_blas_device_get_name(dev);
+    props->description = ggml_backend_blas_device_get_description(dev);
+    props->type = ggml_backend_blas_device_get_type(dev);
+    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* .async = */ false,
+        /* .host_buffer = */ false,
+        /* .buffer_from_host_ptr = */ true,
+        /* .events = */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
+    return ggml_backend_blas_init();
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
+    return ggml_backend_cpu_buffer_type();
+
+    GGML_UNUSED(dev);
+}
+
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+    return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+    GGML_UNUSED(dev);
+    GGML_UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    switch (op->op) {
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            return true;
+
+        case GGML_OP_MUL_MAT:
+        {
+            // BLAS usually is only faster for large matrices
+            const struct ggml_tensor * src0 = op->src[0];
+            const struct ggml_tensor * src1 = op->src[1];
+
+            const int64_t ne10 = src1->ne[0];
+
+            const int64_t ne0 = op->ne[0];
+            const int64_t ne1 = op->ne[1];
+
+            // TODO: find the optimal value
+            const int64_t min_batch = 32;
+
+            return ggml_is_contiguous(src0) &&
+                   ggml_is_contiguous(src1) &&
+                   src1->type == GGML_TYPE_F32 &&
+                   (ne0 >= min_batch && ne1 >= min_batch && ne10 >= min_batch) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+        }
+
+        case GGML_OP_OUT_PROD:
+            return op->src[0]->type == GGML_TYPE_F32 &&
+                   op->src[1]->type == GGML_TYPE_F32 &&
+                   ggml_is_matrix(src0) &&
+                   ggml_is_matrix(src1) &&
+                   ggml_is_contiguous(src0) &&
+                   (ggml_is_contiguous(src1) || ggml_is_transposed(src1)) &&
+                   (src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL);
+
+        default:
+            return false;
+
+    }
+
+    GGML_UNUSED(dev);
+}
+
+static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    return ggml_backend_buft_is_host(buft);
+
+    GGML_UNUSED(dev);
+}
+
+static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
+    /* .get_name = */ ggml_backend_blas_device_get_name,
+    /* .get_description = */ ggml_backend_blas_device_get_description,
+    /* .get_memory = */ ggml_backend_blas_device_get_memory,
+    /* .get_type = */ ggml_backend_blas_device_get_type,
+    /* .get_props = */ ggml_backend_blas_device_get_props,
+    /* .init_backend = */ ggml_backend_blas_device_init,
+    /* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
+    /* .get_host_buffer_type = */ NULL,
+    /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
+    /* .supports_op = */ ggml_backend_blas_device_supports_op,
+    /* .supports_buft = */ ggml_backend_blas_device_supports_buft,
+    /* .offload_op = */ NULL,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
+    return "BLAS";
+
+    GGML_UNUSED(reg);
+}
+
+static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
+    return 1;
+
+    GGML_UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ASSERT(index == 0);
+
+    static ggml_backend_device ggml_backend_blas_device = {
+        /* .iface = */ ggml_backend_blas_device_i,
+        /* .reg = */ reg,
+        /* .context = */ nullptr,
+    };
+
+    return &ggml_backend_blas_device;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(index);
+}
+
+static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
+    if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) {
+        return (void *)ggml_backend_blas_set_n_threads;
+    }
+    return NULL;
+
+    GGML_UNUSED(reg);
+    GGML_UNUSED(name);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
+    /* .get_name = */ ggml_backend_blas_reg_get_name,
+    /* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
+    /* .get_device = */ ggml_backend_blas_reg_get_device,
+    /* .get_proc_address = */ ggml_backend_blas_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_blas_reg(void) {
+    static struct ggml_backend_reg ggml_backend_blas_reg = {
+        /* .iface = */ ggml_backend_blas_reg_i,
+        /* .context = */ NULL,
+    };
+
+    return &ggml_backend_blas_reg;
+}
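
With the BLAS backend now exposing a registry and a device, callers can reach backend-specific extensions such as the thread-count setter through get_proc_address instead of linking against the BLAS-only symbol directly. A small C++ usage sketch, not part of this commit; it relies on the internal structs from ggml-backend-impl.h shown earlier and on ggml_backend_reg_dev_get, the same helper this commit uses inside ggml_backend_blas_init():

    // sketch only - not part of this commit
    #include "ggml-backend-impl.h"
    #include "ggml-blas.h"

    typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);

    static ggml_backend_t init_blas_backend(int n_threads) {
        ggml_backend_reg_t reg     = ggml_backend_blas_reg();
        ggml_backend_dev_t dev     = ggml_backend_reg_dev_get(reg, 0);
        ggml_backend_t     backend = dev->iface.init_backend(dev, nullptr);

        // "ggml_backend_set_n_threads" is the name answered by ggml_backend_blas_get_proc_address() above
        auto set_n_threads = (ggml_backend_set_n_threads_t) reg->iface.get_proc_address(reg, "ggml_backend_set_n_threads");
        if (set_n_threads != nullptr) {
            set_n_threads(backend, n_threads);
        }
        return backend;
    }

Because supports_op and supports_buft moved to the device interface, the scheduler can query them without ever instantiating the backend stream.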
@@ -39,69 +39,6 @@

 #include "ggml-common.h"

-/**
- * @brief Default logging callback for GGML.
- *
- * This function is the default logging callback that logs messages to stderr.
- *
- * @param level The log level.
- * @param msg The log message.
- * @param user_data User data passed to the callback.
- */
-static void ggml_cann_default_log_callback(enum ggml_log_level level,
-                                           const char* msg, void* user_data) {
-    GGML_UNUSED(level);
-    GGML_UNUSED(user_data);
-    fprintf(stderr, "%s", msg);
-}
-
-ggml_log_callback ggml_cann_log_callback = ggml_cann_default_log_callback;
-void* ggml_cann_log_user_data = NULL;
-
-GGML_API void ggml_backend_cann_log_set_callback(ggml_log_callback log_callback,
-                                                 void* user_data) {
-    ggml_cann_log_callback = log_callback;
-    ggml_cann_log_user_data = user_data;
-}
-
-#define GGML_CANN_LOG_INFO(...) ggml_cann_log(GGML_LOG_LEVEL_INFO, __VA_ARGS__)
-#define GGML_CANN_LOG_WARN(...) ggml_cann_log(GGML_LOG_LEVEL_WARN, __VA_ARGS__)
-#define GGML_CANN_LOG_ERROR(...) \
-    ggml_cann_log(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
-
-GGML_ATTRIBUTE_FORMAT(2, 3)
-
-/**
- * @brief Log a message using the current logging callback.
- *
- * This function formats a log message and passes it to the current logging
- * callback.
- *
- * @param level The log level.
- * @param format The format string for the log message.
- * @param ... The arguments for the format string.
- */
-static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
-    if (ggml_cann_log_callback != NULL) {
-        va_list args;
-        va_start(args, format);
-        char buffer[128];
-        int len = vsnprintf(buffer, 128, format, args);
-        if (len < 128) {
-            ggml_cann_log_callback(level, buffer, ggml_cann_log_user_data);
-        } else {
-            // vsnprintf adds a null terminator
-            std::vector<char> buffer2(len + 1);
-            va_end(args);
-            va_start(args, format);
-            vsnprintf(&buffer2[0], buffer2.size(), format, args);
-            ggml_cann_log_callback(level, buffer2.data(),
-                                   ggml_cann_log_user_data);
-        }
-        va_end(args);
-    }
-}
-
 /**
  * @brief Handles CANN errors by printing an error message and aborting.
  *
@@ -116,10 +53,10 @@ static void ggml_cann_log(enum ggml_log_level level, const char* format, ...) {
     int32_t id = -1;
     aclrtGetDevice(&id);

-    GGML_CANN_LOG_ERROR("CANN error: %s\n", msg);
+    GGML_LOG_ERROR("CANN error: %s\n", msg);
-    GGML_CANN_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
+    GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func,
                    file, line);
-    GGML_CANN_LOG_ERROR(" %s\n", stmt);
+    GGML_LOG_ERROR(" %s\n", stmt);
     // abort with GGML_ASSERT to get a stack trace
     GGML_ABORT("CANN error");
 }
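
The hunks above drop the CANN-specific logging plumbing (ggml_backend_cann_log_set_callback, ggml_cann_log and the GGML_CANN_LOG_* macros) in favour of the library-wide GGML_LOG_* macros. A minimal sketch of how an application would redirect those logs after this change; it assumes a central ggml_log_set(callback, user_data) setter in ggml.h, which is not shown in this diff:

    // sketch only - not part of this commit; ggml_log_set() is assumed, not shown in this diff
    #include "ggml.h"
    #include <cstdio>

    static void my_ggml_log(enum ggml_log_level level, const char * text, void * user_data) {
        FILE * stream = (FILE *) user_data;
        if (level != GGML_LOG_LEVEL_DEBUG) { // drop the GGML_LOG_DEBUG noise, keep warnings and errors
            fputs(text, stream);
            fflush(stream);
        }
    }

    int main(void) {
        ggml_log_set(my_ggml_log, stderr); // replaces the removed ggml_backend_cann_log_set_callback()
        // ... normal ggml usage ...
        return 0;
    }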
@@ -165,7 +102,7 @@ static ggml_cann_device_info ggml_cann_init() {
     aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count);

     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR("%s: failed to initialize CANN: %s\n",
+        GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n",
                        __func__, aclGetRecentErrMsg());
         return info;
     }
@@ -315,7 +252,7 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
         *actual_size = look_ahead_size;
         pool_size += look_ahead_size;
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO(
+        GGML_LOG_INFO(
             "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
             "requested %u MB\n",
             __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024),
@@ -470,7 +407,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         // add to the pool
         pool_size += reserve_size;

-        // GGML_CANN_LOG_INFO("cann pool[%d]: size increased to %llu MB (
+        // GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
         //     reserved %llu MB)\n",
         //     device, (unsigned long long) (pool_size/1024/1024),
         //     (unsigned long long) (reserve_size/1024/1024));
@@ -483,7 +420,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
         pool_used += size;

 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device,
                        (unsigned long long)size, (unsigned long long)ptr);
 #endif
         return ptr;
@@ -497,7 +434,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
     */
     void free(void* ptr, size_t size) override {
 #ifdef DEBUG_CANN_MALLOC
-        GGML_CANN_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
+        GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device,
                        (unsigned long long)size, (unsigned long long)ptr);
 #endif

@@ -560,7 +497,7 @@ struct ggml_backend_cann_buffer_context {
  * @return A pointer to a C-string containing the name of the buffer.
  */

-GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
+static const char* ggml_backend_cann_buffer_get_name(
     ggml_backend_buffer_t buffer) {
     return "CANN";
@@ -576,7 +513,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_get_name(
  * @param buffer The buffer to check.
  * @return true if the buffer is a CANN buffer, false otherwise.
  */
-GGML_CALL static bool ggml_backend_buffer_is_cann(
+static bool ggml_backend_buffer_is_cann(
     ggml_backend_buffer_t buffer) {
     return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
 }
@@ -589,7 +526,7 @@ GGML_CALL static bool ggml_backend_buffer_is_cann(
  *
  * @param buffer The CANN buffer to free.
  */
-GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
+static void ggml_backend_cann_buffer_free_buffer(
     ggml_backend_buffer_t buffer) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -605,7 +542,7 @@ GGML_CALL static void ggml_backend_cann_buffer_free_buffer(
  * @param buffer The CANN buffer whose base pointer is to be retrieved.
  * @return A pointer to the base of the device memory allocated for the buffer.
  */
-GGML_CALL static void* ggml_backend_cann_buffer_get_base(
+static void* ggml_backend_cann_buffer_get_base(
     ggml_backend_buffer_t buffer) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -625,7 +562,7 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
+static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                              const void* src,
                                              void* dst) {

@@ -677,7 +614,7 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where the Q4.0 formatted data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
+static void ggml_backend_cann_transform_back_q4_0(
     const ggml_tensor* tensor, void* src, void* dst) {

     int64_t n_elems = ggml_nelements(tensor);
@@ -726,7 +663,7 @@ GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
+static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
                                              const void* src,
                                              void* dst) {
     int64_t n_elems = ggml_nelements(tensor);
@@ -760,7 +697,7 @@ GGML_CALL static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where the Q8.0 formatted data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
+static void ggml_backend_cann_transform_back_q8_0(
     const ggml_tensor* tensor, const void* src, void* dst) {
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK8_0;
@@ -792,7 +729,7 @@ GGML_CALL static void ggml_backend_cann_transform_back_q8_0(
  * @param dst Pointer to the destination buffer where transformed data will be
  * stored.
  */
-GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
+static void ggml_backend_cann_transform(ggml_tensor* tensor,
                                         const void* src, void* dst) {
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
@@ -818,7 +755,7 @@ GGML_CALL static void ggml_backend_cann_transform(ggml_tensor* tensor,
  * @param dst Pointer to the destination buffer where transformed tensor data
  * will be stored.
  */
-GGML_CALL static void ggml_backend_cann_transform_back(
+static void ggml_backend_cann_transform_back(
     const ggml_tensor* tensor, void* src, void* dst) {
     switch (tensor->type) {
         case GGML_TYPE_Q4_0:
@@ -841,7 +778,7 @@ GGML_CALL static void ggml_backend_cann_transform_back(
  * @param type The tensor type to check.
  * @return true if transformation is needed, false otherwise.
  */
-GGML_CALL static bool need_transform(ggml_type type) {
+static bool need_transform(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q8_0:
@@ -860,7 +797,7 @@ GGML_CALL static bool need_transform(ggml_type type) {
  * @param buffer The CANN buffer from which to initialize the tensor.
  * @param tensor Pointer to the tensor to be initialized.
  */
-GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
+static void ggml_backend_cann_buffer_init_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor* tensor) {
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft);
@@ -896,7 +833,7 @@ GGML_CALL static void ggml_backend_cann_buffer_init_tensor(
  * @param offset Offset in the source data from where to start copying.
  * @param size Size of the data to be copied, in bytes.
  */
-GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
+static void ggml_backend_cann_buffer_set_tensor(
     ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data,
     size_t offset, size_t size) {
     ggml_backend_cann_buffer_context *ctx =
@@ -941,7 +878,7 @@ GGML_CALL static void ggml_backend_cann_buffer_set_tensor(
  * @param offset Offset in the destination buffer where to start copying.
  * @param size Size of the data to be copied, in bytes.
  */
-GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
+static void ggml_backend_cann_buffer_get_tensor(
     ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data,
     size_t offset, size_t size) {
     ggml_backend_cann_buffer_context* ctx =
@@ -975,7 +912,7 @@ GGML_CALL static void ggml_backend_cann_buffer_get_tensor(
  * @param dst Pointer to the destination tensor where the data will be copied.
  * @return true if the copy operation succeeded, false otherwise.
  */
-GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
+static bool ggml_backend_cann_buffer_cpy_tensor(
     ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) {
     if (ggml_backend_buffer_is_cann(src->buffer)) {
         ggml_backend_cann_buffer_context* src_ctx =
@@ -1017,7 +954,7 @@ GGML_CALL static bool ggml_backend_cann_buffer_cpy_tensor(
  * @param buffer The CANN buffer to be cleared.
  * @param value The value to which each byte in the buffer will be set.
  */
-GGML_CALL static void ggml_backend_cann_buffer_clear(
+static void ggml_backend_cann_buffer_clear(
     ggml_backend_buffer_t buffer, uint8_t value) {
     ggml_backend_cann_buffer_context* ctx =
         (ggml_backend_cann_buffer_context*)buffer->context;
@@ -1065,7 +1002,7 @@ struct ggml_backend_cann_buffer_type_context {
  * @param buft Pointer to the buffer type context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
+static const char* ggml_backend_cann_buffer_type_name(
     ggml_backend_buffer_type_t buft) {
     return "CANN";

@@ -1082,7 +1019,7 @@ GGML_CALL static const char* ggml_backend_cann_buffer_type_name(
  * @param size Size in bytes of the buffer to allocate.
  * @return Pointer to the allocated buffer, or nullptr if allocation fails.
  */
-GGML_CALL static ggml_backend_buffer_t
+static ggml_backend_buffer_t
 ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) {
     ggml_backend_cann_buffer_type_context* buft_ctx =
@@ -1095,7 +1032,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     void* dev_ptr;
     aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST);
     if (err != ACL_SUCCESS) {
-        GGML_CANN_LOG_ERROR(
+        GGML_LOG_ERROR(
             "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n",
             __func__, size / 1024.0 / 1024.0, buft_ctx->device,
             aclGetRecentErrMsg());
@@ -1121,7 +1058,7 @@ ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
  * @return The alignment requirement in bytes (fixed at 128 bytes for CANN
  * buffers).
  */
-GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
+static size_t ggml_backend_cann_buffer_type_get_alignment(
     ggml_backend_buffer_type_t buft) {
     return 128;

@@ -1142,7 +1079,7 @@ GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alignment(
  * @return The total allocation size in bytes required for the tensor in the
  * CANN buffer.
  */
-GGML_CALL static size_t ggml_backend_cann_buffer_type_get_alloc_size(
+static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
@@ -1193,7 +1130,7 @@ static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
  * @return A pointer to the buffer type interface for the specified device, or
  * nullptr if the device index is out of range.
  */
-GGML_CALL ggml_backend_buffer_type_t
+ggml_backend_buffer_type_t
 ggml_backend_cann_buffer_type(int32_t device) {
     static std::mutex mutex;
     std::lock_guard<std::mutex> lock(mutex);
@@ -1231,7 +1168,7 @@ ggml_backend_cann_buffer_type(int32_t device) {
  * @param buft Pointer to the host buffer type context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
     return "CANN_Host";

     GGML_UNUSED(buft);
@@ -1246,7 +1183,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backe
  * @param buft Pointer to the host buffer context.
  * @return Const pointer to the C-style string containing the name.
  */
-GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
+static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
     return "CANN_Host";

     GGML_UNUSED(buffer);
@@ -1260,7 +1197,7 @@ GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_bu
  *
  * @param buffer The CANN host buffer to free.
  */
-GGML_CALL static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
+static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) {
     ACL_CHECK(aclrtFreeHost(buffer->context));
 }

@@ -1280,7 +1217,7 @@ static void * ggml_cann_host_malloc(size_t size) {
     aclError err = aclrtMallocHost((void **) &hostPtr, size);
     if (err != ACL_SUCCESS) {

-        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                        size / 1024.0 / 1024.0, aclGetRecentErrMsg());
         return nullptr;
     }
@@ -1294,7 +1231,7 @@ static void * ggml_cann_host_malloc(size_t size) {
  * @param size Size in bytes of the host buffer to allocate.
  * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails.
  */
-GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     void * hostPtr = ggml_cann_host_malloc(size);

     if (hostPtr == nullptr) {
@@ -1316,7 +1253,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_
  * Provides function pointers for allocating, querying properties, and managing
  * memory for CANN buffer types in the GGML backend.
  */
-GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
+ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = {
         /* .iface = */ {
             /* .get_name = */ ggml_backend_cann_host_buffer_type_name,
@@ -1326,6 +1263,7 @@ GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
             /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
         },
+        /* .device = */ nullptr,
         /* .context = */ nullptr,
     };

@@ -1495,7 +1433,7 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
  * @param backend Pointer to the CANN backend structure.
  * @return A pointer to a constant string representing the backend name.
  */
-GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
+static const char* ggml_backend_cann_name(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;

@@ -1510,7 +1448,7 @@ GGML_CALL static const char* ggml_backend_cann_name(ggml_backend_t backend) {
  *
  * @param backend Pointer to the CANN backend structure to be freed.
  */
-GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
+static void ggml_backend_cann_free(ggml_backend_t backend) {
     ggml_backend_cann_context* cann_ctx =
         (ggml_backend_cann_context*)backend->context;
     ACL_CHECK(aclrtSynchronizeDevice());

@@ -1535,7 +1473,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
@ -1535,7 +1473,7 @@ GGML_CALL static void ggml_backend_cann_free(ggml_backend_t backend) {
|
||||||
* @param backend Pointer to the CANN backend structure.
|
* @param backend Pointer to the CANN backend structure.
|
||||||
* @return Pointer to the buffer type structure for the CANN backend.
|
* @return Pointer to the buffer type structure for the CANN backend.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static ggml_backend_buffer_type_t
|
static ggml_backend_buffer_type_t
|
||||||
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
@ -1556,7 +1494,7 @@ ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
|
||||||
* @param offset Offset in bytes within the host data.
|
* @param offset Offset in bytes within the host data.
|
||||||
* @param size Size of the data to copy in bytes.
|
* @param size Size of the data to copy in bytes.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||||
ggml_tensor *tensor,
|
ggml_tensor *tensor,
|
||||||
const void *data,
|
const void *data,
|
||||||
size_t offset,
|
size_t offset,
|
||||||
|
@ -1587,7 +1525,7 @@ GGML_CALL static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
static void ggml_backend_cann_get_tensor_async(
|
||||||
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
ggml_backend_t backend, const ggml_tensor *tensor, void *data,
|
||||||
size_t offset, size_t size) {
|
size_t offset, size_t size) {
|
||||||
ggml_backend_cann_context *cann_ctx =
|
ggml_backend_cann_context *cann_ctx =
|
||||||
|
@ -1626,7 +1564,7 @@ GGML_CALL static void ggml_backend_cann_get_tensor_async(
|
||||||
* @param dst Pointer to the destination tensor to copy data to.
|
* @param dst Pointer to the destination tensor to copy data to.
|
||||||
* @return true if the copy operation succeeds, false otherwise.
|
* @return true if the copy operation succeeds, false otherwise.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
|
static bool ggml_backend_cann_cpy_tensor_async(
|
||||||
ggml_backend_t backend_src, ggml_backend_t backend_dst,
|
ggml_backend_t backend_src, ggml_backend_t backend_dst,
|
||||||
const ggml_tensor* src, ggml_tensor* dst) {
|
const ggml_tensor* src, ggml_tensor* dst) {
|
||||||
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
|
GGML_ASSERT(ggml_backend_is_cann(backend_src) ||
|
||||||
|
@ -1694,7 +1632,7 @@ GGML_CALL static bool ggml_backend_cann_cpy_tensor_async(
|
||||||
*
|
*
|
||||||
* @param backend Pointer to the CANN backend structure to synchronize.
|
* @param backend Pointer to the CANN backend structure to synchronize.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
|
||||||
|
@ -1715,7 +1653,7 @@ GGML_CALL static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||||
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
* @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation
|
||||||
* completes successfully, otherwise an appropriate error status.
|
* completes successfully, otherwise an appropriate error status.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
static enum ggml_status ggml_backend_cann_graph_compute(
|
||||||
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
@ -1732,7 +1670,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
||||||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||||
|
|
||||||
if (!ok) {
|
if (!ok) {
|
||||||
GGML_CANN_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
||||||
node->name, ggml_op_name(node->op));
|
node->name, ggml_op_name(node->op));
|
||||||
}
|
}
|
||||||
GGML_ASSERT(ok);
|
GGML_ASSERT(ok);
|
||||||
|
@ -1753,7 +1691,7 @@ GGML_CALL static enum ggml_status ggml_backend_cann_graph_compute(
|
||||||
* @return bool Returns true if the operation is supported by the backend,
|
* @return bool Returns true if the operation is supported by the backend,
|
||||||
* otherwise false.
|
* otherwise false.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
|
||||||
const ggml_tensor* op) {
|
const ggml_tensor* op) {
|
||||||
switch (op->op) {
|
switch (op->op) {
|
||||||
case GGML_OP_UNARY:
|
case GGML_OP_UNARY:
|
||||||
|
@ -1875,7 +1813,7 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
|
||||||
* @return bool Returns true if the CANN backend supports the buffer type,
|
* @return bool Returns true if the CANN backend supports the buffer type,
|
||||||
* otherwise false.
|
* otherwise false.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static bool ggml_backend_cann_supports_buft(
|
static bool ggml_backend_cann_supports_buft(
|
||||||
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
|
||||||
if (ggml_backend_buft_is_cann(buft)) {
|
if (ggml_backend_buft_is_cann(buft)) {
|
||||||
ggml_backend_cann_context * cann_ctx =
|
ggml_backend_cann_context * cann_ctx =
|
||||||
|
@ -1901,7 +1839,7 @@ GGML_CALL static bool ggml_backend_cann_supports_buft(
|
||||||
* @return bool Returns true if the operation should be offloaded, otherwise
|
* @return bool Returns true if the operation should be offloaded, otherwise
|
||||||
* false.
|
* false.
|
||||||
*/
|
*/
|
||||||
GGML_CALL static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
|
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
|
||||||
const ggml_tensor* op) {
|
const ggml_tensor* op) {
|
||||||
const int min_batch_size = 32;
|
const int min_batch_size = 32;
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
|
@ -2021,11 +1959,8 @@ static ggml_backend_i ggml_backend_cann_interface = {
|
||||||
/* .supports_op = */ ggml_backend_cann_supports_op,
|
/* .supports_op = */ ggml_backend_cann_supports_op,
|
||||||
/* .supports_buft = */ ggml_backend_cann_supports_buft,
|
/* .supports_buft = */ ggml_backend_cann_supports_buft,
|
||||||
/* .offload_op = */ ggml_backend_cann_offload_op,
|
/* .offload_op = */ ggml_backend_cann_offload_op,
|
||||||
/* .event_new = */ ggml_backend_cann_event_new,
|
|
||||||
/* .event_free = */ ggml_backend_cann_event_free,
|
|
||||||
/* .event_record = */ ggml_backend_cann_event_record,
|
/* .event_record = */ ggml_backend_cann_event_record,
|
||||||
/* .event_wait = */ ggml_backend_cann_event_wait,
|
/* .event_wait = */ ggml_backend_cann_event_wait,
|
||||||
/* .event_synchronize = */ ggml_backend_cann_event_synchronize,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -2042,91 +1977,46 @@ static ggml_guid_t ggml_backend_cann_guid() {
|
||||||
return &guid;
|
return &guid;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
ggml_backend_t ggml_backend_cann_init(int32_t device) {
|
||||||
aclInit(nullptr);
|
aclInit(nullptr);
|
||||||
if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
|
if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
|
||||||
GGML_CANN_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
|
GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
|
ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device);
|
||||||
if (ctx == nullptr) {
|
if (ctx == nullptr) {
|
||||||
GGML_CANN_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
ggml_cann_set_device(ctx->device);
|
ggml_cann_set_device(ctx->device);
|
||||||
ggml_backend_t cann_backend =
|
ggml_backend_t cann_backend =
|
||||||
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
|
||||||
/* .interface = */ ggml_backend_cann_interface,
|
/* .interface = */ ggml_backend_cann_interface,
|
||||||
|
/* .device = */ nullptr,
|
||||||
/* .context = */ ctx};
|
/* .context = */ ctx};
|
||||||
|
|
||||||
return cann_backend;
|
return cann_backend;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL bool ggml_backend_is_cann(ggml_backend_t backend) {
|
bool ggml_backend_is_cann(ggml_backend_t backend) {
|
||||||
return backend != NULL &&
|
return backend != NULL &&
|
||||||
ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
|
ggml_guid_matches(backend->guid, ggml_backend_cann_guid());
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL int32_t ggml_backend_cann_get_device_count() {
|
int32_t ggml_backend_cann_get_device_count() {
|
||||||
return ggml_cann_info().device_count;
|
return ggml_cann_info().device_count;
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL void ggml_backend_cann_get_device_description(
|
void ggml_backend_cann_get_device_description(
|
||||||
int32_t device, char* description, size_t description_size) {
|
int32_t device, char* description, size_t description_size) {
|
||||||
ggml_cann_set_device(device);
|
ggml_cann_set_device(device);
|
||||||
const char* soc_name = aclrtGetSocName();
|
const char* soc_name = aclrtGetSocName();
|
||||||
snprintf(description, description_size, "%s", soc_name);
|
snprintf(description, description_size, "%s", soc_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
GGML_CALL void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
|
||||||
size_t* total) {
|
size_t* total) {
|
||||||
ggml_cann_set_device(device);
|
ggml_cann_set_device(device);
|
||||||
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
|
||||||
}
|
}
|
||||||
|
|
||||||
// backend registry
|
|
||||||
/**
|
|
||||||
* @brief Initializes a CANN backend based on the provided parameters.
|
|
||||||
*
|
|
||||||
* This function initializes a CANN backend using the device index and then
|
|
||||||
* initializes the backend using `ggml_backend_cann_init`.
|
|
||||||
*
|
|
||||||
* @param params Parameters for initialization (unused in this implementation).
|
|
||||||
* @param user_data User data containing the device index to initialize the
|
|
||||||
* backend.
|
|
||||||
* @return ggml_backend_t The initialized CANN backend.
|
|
||||||
*/
|
|
||||||
GGML_CALL static ggml_backend_t ggml_backend_reg_cann_init(const char* params,
|
|
||||||
void* user_data) {
|
|
||||||
ggml_backend_t cann_backend =
|
|
||||||
ggml_backend_cann_init((int)(intptr_t)user_data);
|
|
||||||
return cann_backend;
|
|
||||||
|
|
||||||
GGML_UNUSED(params);
|
|
||||||
}
|
|
||||||
|
|
||||||
extern "C" GGML_CALL int ggml_backend_cann_reg_devices();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Registers CANN (Ascend) devices as backend options.
|
|
||||||
*
|
|
||||||
* This function initializes ACL, retrieves the number of available CANN
|
|
||||||
* devices, and registers each device as a backend option using
|
|
||||||
* `ggml_backend_register`. Each device is given a unique name based on
|
|
||||||
* `GGML_CANN_NAME` followed by its index.
|
|
||||||
*
|
|
||||||
* @return int The number of CANN devices registered.
|
|
||||||
*/
|
|
||||||
GGML_CALL int ggml_backend_cann_reg_devices() {
|
|
||||||
uint32_t device_count = ggml_backend_cann_get_device_count();
|
|
||||||
// initialization
|
|
||||||
for (uint32_t i = 0; i < device_count; i++) {
|
|
||||||
char name[128];
|
|
||||||
snprintf(name, sizeof(name), "CANN%d", i);
|
|
||||||
ggml_backend_register(name, ggml_backend_reg_cann_init,
|
|
||||||
ggml_backend_cann_buffer_type(i),
|
|
||||||
(void*)(intptr_t)i);
|
|
||||||
}
|
|
||||||
return device_count;
|
|
||||||
}
|
|
||||||
|
|
File diff suppressed because it is too large
79
ggml/src/ggml-cuda/argmax.cu
Normal file
@@ -0,0 +1,79 @@
+#include "common.cuh"
+#include "argmax.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+static __global__ void argmax_f32(
+    const float * x, int32_t * dst, const int64_t ncols, const int64_t nrows) {
+
+    int argmax_thread = 0;
+    const int64_t row0 = (int64_t)blockIdx.x*WARP_SIZE;
+
+#pragma unroll
+    for (int64_t row1 = 0; row1 < WARP_SIZE; ++row1) {
+        const int64_t row = row0 + row1;
+
+        if (row >= nrows) {
+            break;
+        }
+
+        float maxval = -FLT_MAX;
+        int   argmax = -1;
+
+        for (int32_t col = threadIdx.x; col < ncols; col += WARP_SIZE) {
+            const float val        = x[row*ncols + col];
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+#pragma unroll
+        for (int mask = 16; mask > 0; mask >>= 1) {
+            const float val        = __shfl_xor_sync(0xFFFFFFFF, maxval, mask, WARP_SIZE);
+            const int   col        = __shfl_xor_sync(0xFFFFFFFF, argmax, mask, WARP_SIZE);
+            const int   bigger     = val > maxval;
+            const int   not_bigger = bigger ^ 0x00000001;
+
+            maxval = maxval*not_bigger + val*bigger;
+            argmax = argmax*not_bigger + col*bigger;
+        }
+
+        const int store = row1 == threadIdx.x;
+        argmax_thread += store*argmax;
+    }
+
+    const int row = row0 + threadIdx.x;
+
+    if (row >= nrows) {
+        return;
+    }
+
+    dst[row] = argmax_thread;
+}
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const int64_t ne00  = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    const float * src0_d = (const float *) src0->data;
+    int32_t     * dst_d  = (int32_t     *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+
+    const int64_t num_blocks = (nrows + WARP_SIZE - 1) / WARP_SIZE;
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(num_blocks, 1, 1);
+
+    argmax_f32<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, dst_d, ne00, nrows);
+}
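Note (not part of the diff): the kernel above keeps the running maximum branch-free by turning each comparison into a 0/1 flag and blending the old and new value/index arithmetically, both in the column loop and in the warp shuffle reduction. Below is a minimal host-side C++ sketch of that update rule only; the function and variable names are illustrative, not part of the codebase.

    #include <cfloat>
    #include <cstdio>

    // Branchless max/argmax update, mirroring the bigger/not_bigger blend in argmax_f32.
    static void argmax_branchless(const float * x, int n, float * out_max, int * out_arg) {
        float maxval = -FLT_MAX;
        int   argmax = -1;
        for (int col = 0; col < n; ++col) {
            const int bigger     = x[col] > maxval; // 1 if a new maximum, else 0
            const int not_bigger = bigger ^ 1;
            maxval = maxval*not_bigger + x[col]*bigger;
            argmax = argmax*not_bigger + col*bigger;
        }
        *out_max = maxval;
        *out_arg = argmax;
    }

    int main() {
        const float x[] = {0.5f, 2.0f, -1.0f, 1.5f};
        float m; int a;
        argmax_branchless(x, 4, &m, &a);
        printf("argmax = %d (value %.1f)\n", a, m); // argmax = 1 (value 2.0)
    }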
3
ggml/src/ggml-cuda/argmax.cuh
Normal file
@@ -0,0 +1,3 @@
+#include "common.cuh"
+
+void ggml_cuda_argmax(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -175,6 +175,18 @@ static __device__ void no_device_code(
 #define NO_DEVICE_CODE //GGML_ABORT("NO_DEVICE_CODE not valid in host code.")
 #endif // __CUDA_ARCH__

+static __device__ __forceinline__ int warp_reduce_sum(int x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+    return __reduce_add_sync(0xffffffff, x);
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
+}
+
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
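Note (not part of the diff): on devices without __reduce_add_sync the new integer overload falls back to the same XOR-shuffle butterfly as the float version, so after log2(32) = 5 steps every lane holds the full warp sum. A host-side C++ sketch that emulates the pattern with a plain array standing in for the 32 lanes (illustrative only, not the device code):

    #include <cstdio>

    // Host-side emulation of the XOR-shuffle butterfly used by warp_reduce_sum(int):
    // at each step a lane adds the value held by its partner lane (lane ^ mask).
    static void warp_reduce_sum_emulated(int lanes[32]) {
        for (int mask = 16; mask > 0; mask >>= 1) {
            int next[32];
            for (int lane = 0; lane < 32; ++lane) {
                next[lane] = lanes[lane] + lanes[lane ^ mask]; // __shfl_xor_sync partner
            }
            for (int lane = 0; lane < 32; ++lane) {
                lanes[lane] = next[lane];
            }
        }
    }

    int main() {
        int lanes[32];
        for (int i = 0; i < 32; ++i) lanes[i] = i; // 0 + 1 + ... + 31 = 496
        warp_reduce_sum_emulated(lanes);
        printf("lane 0 sum = %d\n", lanes[0]);     // prints 496 (every lane holds the total)
    }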
64
ggml/src/ggml-cuda/count-equal.cu
Normal file
@@ -0,0 +1,64 @@
+#include "common.cuh"
+#include "count-equal.cuh"
+
+#include <cstdint>
+
+template <typename T>
+static __global__ void count_equal(const T * __restrict__ x, const T * __restrict__ y, int64_t * __restrict__ dst, const int64_t dk, const int64_t k) {
+    const int64_t i0 = (int64_t) blockIdx.x*dk;
+    const int64_t i1 = min(i0 + dk, k);
+
+    int nequal = 0;
+
+    for (int64_t i = i0 + threadIdx.x; i < i1; i += WARP_SIZE) {
+        const T xi = x[i];
+        const T yi = y[i];
+        nequal += xi == yi;
+    }
+
+    nequal = warp_reduce_sum(nequal);
+
+    if (threadIdx.x != 0) {
+        return;
+    }
+
+    atomicAdd((int *) dst, nequal);
+}
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const ggml_tensor * src1 = dst->src[1];
+
+    GGML_ASSERT(src0->type == src1->type);
+    GGML_ASSERT( dst->type == GGML_TYPE_I64);
+
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_is_contiguous(src1));
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
+    int64_t * dst_d = (int64_t *) dst->data;
+
+    cudaStream_t stream = ctx.stream();
+    const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
+
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne < (1 << 30) && "atomicAdd implementation only supports int");
+    const int64_t dne = GGML_PAD(ne / (4*nsm), CUDA_COUNT_EQUAL_CHUNK_SIZE);
+
+    CUDA_CHECK(cudaMemsetAsync(dst_d, 0, ggml_nbytes(dst), stream));
+
+    const dim3 blocks_dim(WARP_SIZE, 1, 1);
+    const dim3 blocks_num(std::min((int64_t)4*nsm, (ne + CUDA_COUNT_EQUAL_CHUNK_SIZE - 1)/CUDA_COUNT_EQUAL_CHUNK_SIZE), 1, 1);
+
+    switch (src0->type) {
+        case GGML_TYPE_I32: {
+            const int * src0_d = (const int *) src0->data;
+            const int * src1_d = (const int *) src1->data;
+            count_equal<<<blocks_num, blocks_dim, 0, stream>>>(src0_d, src1_d, dst_d, dne, ne);
+        } break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+}
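Note (not part of the diff): each block of the kernel above handles one [i0, i1) chunk of at most dk elements and atomically adds its partial count into dst; the accumulation goes through a 32-bit atomicAdd on the low word of the int64 result, which is why the launcher asserts ne < 2^30. A serial host-side C++ sketch of the same chunked accumulation, with illustrative names only:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Host-side sketch of the chunked accumulation in ggml_cuda_count_equal:
    // each "block" counts matches in its [i0, i1) slice and adds into a shared counter.
    static int64_t count_equal_chunked(const int * x, const int * y, int64_t k, int64_t dk) {
        int64_t dst = 0;
        for (int64_t i0 = 0; i0 < k; i0 += dk) {     // one iteration per block
            const int64_t i1 = std::min(i0 + dk, k);
            int nequal = 0;                          // per-block partial count
            for (int64_t i = i0; i < i1; ++i) {
                nequal += x[i] == y[i];
            }
            dst += nequal;                           // atomicAdd on the device
        }
        return dst;
    }

    int main() {
        const int x[] = {1, 2, 3, 4, 5, 6};
        const int y[] = {1, 0, 3, 4, 0, 6};
        printf("equal = %lld\n", (long long) count_equal_chunked(x, y, 6, 4)); // prints 4
    }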
5
ggml/src/ggml-cuda/count-equal.cuh
Normal file
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_COUNT_EQUAL_CHUNK_SIZE 128
+
+void ggml_cuda_count_equal(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -259,7 +259,7 @@ static __global__ void flash_attn_tile_ext_f16(
         }

         half kqsum_j = __low2half(kqsum[j_VKQ_0/nwarps]) + __high2half(kqsum[j_VKQ_0/nwarps]);
-        kqsum_j = warp_reduce_sum(kqsum_j);
+        kqsum_j = warp_reduce_sum((float)kqsum_j);

 #pragma unroll
         for (int i00 = 0; i00 < D; i00 += 2*WARP_SIZE) {
@@ -196,7 +196,7 @@ static __global__ void flash_attn_vec_ext_f16(
 #pragma unroll
         for (int j = 0; j < ncols; ++j) {
             half sum = vec_dot_KQ(K + (k_VKQ_0 + i_KQ)*nb11, Q_h2[j], Q_i32[j], Q_ds[j]);
-            sum = warp_reduce_sum(sum);
+            sum = warp_reduce_sum((float)sum);

             if (use_logit_softcap) {
                 sum = logit_softcap*tanhf(sum);
@@ -265,7 +265,7 @@ static __global__ void flash_attn_vec_ext_f16(

 #pragma unroll
     for (int j = 0; j < ncols; ++j) {
-        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        kqsum[j] = warp_reduce_sum((float)kqsum[j]);
         if (threadIdx.x == 0) {
             kqsum_shared[j][threadIdx.y] = kqsum[j];
         }
@@ -280,7 +280,7 @@ static __global__ void flash_attn_vec_ext_f16(
     }

         kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
-        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+        kqsum[j_VKQ] = warp_reduce_sum((float)kqsum[j_VKQ]);

         half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
         if (parallel_blocks == 1) {
@@ -33,6 +33,21 @@ extern "C" {
 #endif
 #endif

+//
+// logging
+//
+
+GGML_ATTRIBUTE_FORMAT(2, 3)
+void ggml_log_internal        (enum ggml_log_level level, const char * format, ...);
+void ggml_log_callback_default(enum ggml_log_level level, const char * text, void * user_data);
+
+#define GGML_LOG(...)       ggml_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
+#define GGML_LOG_INFO(...)  ggml_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+#define GGML_LOG_WARN(...)  ggml_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define GGML_LOG_ERROR(...) ggml_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define GGML_LOG_DEBUG(...) ggml_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#define GGML_LOG_CONT(...)  ggml_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
+
 // bitset

 typedef uint32_t ggml_bitset_t;
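Note (not part of the diff): the macros above all funnel into the single variadic ggml_log_internal, which is what lets backend-specific wrappers such as GGML_CANN_LOG_WARN be replaced by the shared GGML_LOG_* calls in the CANN changes earlier. A self-contained toy in the same style; all names here are illustrative, not ggml's implementation:

    #include <cstdarg>
    #include <cstdio>

    // Toy illustration: one variadic sink plus per-level macros,
    // mirroring how GGML_LOG_INFO/WARN/ERROR funnel into ggml_log_internal.
    enum toy_log_level { TOY_LOG_LEVEL_INFO, TOY_LOG_LEVEL_WARN, TOY_LOG_LEVEL_ERROR };

    static void toy_log_internal(enum toy_log_level level, const char * fmt, ...) {
        static const char * prefix[] = {"INFO", "WARN", "ERROR"};
        va_list args;
        va_start(args, fmt);
        fprintf(stderr, "[%s] ", prefix[level]);
        vfprintf(stderr, fmt, args);
        va_end(args);
    }

    #define TOY_LOG_INFO(...)  toy_log_internal(TOY_LOG_LEVEL_INFO , __VA_ARGS__)
    #define TOY_LOG_WARN(...)  toy_log_internal(TOY_LOG_LEVEL_WARN , __VA_ARGS__)
    #define TOY_LOG_ERROR(...) toy_log_internal(TOY_LOG_LEVEL_ERROR, __VA_ARGS__)

    int main() {
        TOY_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory\n", __func__, 16.0);
        TOY_LOG_ERROR("%s: error: invalid device %d\n", __func__, -1);
    }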
Some files were not shown because too many files have changed in this diff