Merge branch 'master' into vulkan
This commit is contained in:
commit
9c4c15add8
94 changed files with 22156 additions and 2311 deletions
26
.devops/main-intel.Dockerfile
Normal file
26
.devops/main-intel.Dockerfile
Normal file
|
@ -0,0 +1,26 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/main /main
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/main" ]
|
|
@ -7,6 +7,18 @@
|
|||
{ system, ... }:
|
||||
{
|
||||
_module.args = {
|
||||
# Note: bringing up https://zimbatm.com/notes/1000-instances-of-nixpkgs
|
||||
# again, the below creates several nixpkgs instances which the
|
||||
# flake-centric CLI will be forced to evaluate e.g. on `nix flake show`.
|
||||
#
|
||||
# This is currently "slow" and "expensive", on a certain scale.
|
||||
# This also isn't "right" in that this hinders dependency injection at
|
||||
# the level of flake inputs. This might get removed in the foreseeable
|
||||
# future.
|
||||
#
|
||||
# Note that you can use these expressions without Nix
|
||||
# (`pkgs.callPackage ./devops/nix/scope.nix { }` is the entry point).
|
||||
|
||||
pkgsCuda = import inputs.nixpkgs {
|
||||
inherit system;
|
||||
# Ensure dependencies use CUDA consistently (e.g. that openmpi, ucc,
|
||||
|
|
|
@ -73,6 +73,7 @@ let
|
|||
ps: [
|
||||
ps.numpy
|
||||
ps.sentencepiece
|
||||
ps.tiktoken
|
||||
ps.torchWithoutCuda
|
||||
ps.transformers
|
||||
]
|
||||
|
@ -114,14 +115,22 @@ effectiveStdenv.mkDerivation (
|
|||
pname = "llama-cpp${pnameSuffix}";
|
||||
version = llamaVersion;
|
||||
|
||||
# Note: none of the files discarded here are visible in the sandbox or
|
||||
# affect the output hash. This also means they can be modified without
|
||||
# triggering a rebuild.
|
||||
src = lib.cleanSourceWith {
|
||||
filter =
|
||||
name: type:
|
||||
!(builtins.any (_: _) [
|
||||
let
|
||||
noneOf = builtins.all (x: !x);
|
||||
baseName = baseNameOf name;
|
||||
in
|
||||
noneOf [
|
||||
(lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
|
||||
(name == "README.md") # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." name) # Skip hidden files and directories
|
||||
]);
|
||||
(lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
|
||||
(lib.hasPrefix "." baseName) # Skip hidden files and directories
|
||||
(baseName == "flake.lock")
|
||||
];
|
||||
src = lib.cleanSource ../../.;
|
||||
};
|
||||
|
||||
|
@ -159,7 +168,7 @@ effectiveStdenv.mkDerivation (
|
|||
|
||||
cmakeFlags =
|
||||
[
|
||||
(cmakeBool "LLAMA_NATIVE" true)
|
||||
(cmakeBool "LLAMA_NATIVE" false)
|
||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||
(cmakeBool "BUILD_SHARED_LIBS" true)
|
||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||
|
@ -216,6 +225,9 @@ effectiveStdenv.mkDerivation (
|
|||
description = "contains numpy and sentencepiece";
|
||||
buildInputs = [ llama-python ];
|
||||
inputsFrom = [ finalAttrs.finalPackage ];
|
||||
shellHook = ''
|
||||
addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
|
||||
'';
|
||||
};
|
||||
|
||||
shell-extra = mkShell {
|
||||
|
|
|
@ -4,6 +4,10 @@
|
|||
llamaVersion ? "0.0.0",
|
||||
}:
|
||||
|
||||
# We're using `makeScope` instead of just writing out an attrset
|
||||
# because it allows users to apply overlays later using `overrideScope'`.
|
||||
# Cf. https://noogle.dev/f/lib/makeScope
|
||||
|
||||
lib.makeScope newScope (
|
||||
self: {
|
||||
inherit llamaVersion;
|
||||
|
|
32
.devops/server-cuda.Dockerfile
Normal file
32
.devops/server-cuda.Dockerfile
Normal file
|
@ -0,0 +1,32 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG CUDA_VERSION=11.7.1
|
||||
# Target the CUDA build image
|
||||
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
# Target the CUDA runtime image
|
||||
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
FROM ${BASE_CUDA_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
ARG CUDA_DOCKER_ARCH=all
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||
# Enable cuBLAS
|
||||
ENV LLAMA_CUBLAS=1
|
||||
|
||||
RUN make
|
||||
|
||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
25
.devops/server-intel.Dockerfile
Normal file
25
.devops/server-intel.Dockerfile
Normal file
|
@ -0,0 +1,25 @@
|
|||
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
|
||||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM intel/hpckit:$ONEAPI_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
|
||||
RUN mkdir build && \
|
||||
cd build && \
|
||||
cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
|
||||
cmake --build . --config Release --target main server
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/build/bin/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
45
.devops/server-rocm.Dockerfile
Normal file
45
.devops/server-rocm.Dockerfile
Normal file
|
@ -0,0 +1,45 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
# This needs to generally match the container host's environment.
|
||||
ARG ROCM_VERSION=5.6
|
||||
|
||||
# Target the CUDA build image
|
||||
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete
|
||||
|
||||
FROM ${BASE_ROCM_DEV_CONTAINER} as build
|
||||
|
||||
# Unless otherwise specified, we make a fat build.
|
||||
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
|
||||
# This is mostly tied to rocBLAS supported archs.
|
||||
ARG ROCM_DOCKER_ARCH=\
|
||||
gfx803 \
|
||||
gfx900 \
|
||||
gfx906 \
|
||||
gfx908 \
|
||||
gfx90a \
|
||||
gfx1010 \
|
||||
gfx1030 \
|
||||
gfx1100 \
|
||||
gfx1101 \
|
||||
gfx1102
|
||||
|
||||
COPY requirements.txt requirements.txt
|
||||
COPY requirements requirements
|
||||
|
||||
RUN pip install --upgrade pip setuptools wheel \
|
||||
&& pip install -r requirements.txt
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
# Set nvcc architecture
|
||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||
# Enable ROCm
|
||||
ENV LLAMA_HIPBLAS=1
|
||||
ENV CC=/opt/rocm/llvm/bin/clang
|
||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||
|
||||
RUN make
|
||||
|
||||
ENTRYPOINT [ "/app/server" ]
|
20
.devops/server.Dockerfile
Normal file
20
.devops/server.Dockerfile
Normal file
|
@ -0,0 +1,20 @@
|
|||
ARG UBUNTU_VERSION=22.04
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as build
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential git
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN make
|
||||
|
||||
FROM ubuntu:$UBUNTU_VERSION as runtime
|
||||
|
||||
COPY --from=build /app/server /server
|
||||
|
||||
ENV LC_ALL=C.utf8
|
||||
|
||||
ENTRYPOINT [ "/server" ]
|
57
.github/workflows/build.yml
vendored
57
.github/workflows/build.yml
vendored
|
@ -72,7 +72,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-sanitizer:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -107,7 +107,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
ubuntu-latest-cmake-mpi:
|
||||
runs-on: ubuntu-latest
|
||||
|
@ -141,7 +141,48 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose
|
||||
ctest -L main --verbose
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: add oneAPI to apt
|
||||
shell: bash
|
||||
run: |
|
||||
cd /tmp
|
||||
wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||
sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
|
||||
|
||||
- name: install oneAPI dpcpp compiler
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt update
|
||||
sudo apt install intel-oneapi-compiler-dpcpp-cpp
|
||||
|
||||
- name: install oneAPI MKL library
|
||||
shell: bash
|
||||
run: |
|
||||
sudo apt install intel-oneapi-mkl-devel
|
||||
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
id: cmake_build
|
||||
run: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
mkdir build
|
||||
cd build
|
||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||
# how to debug it.
|
||||
|
@ -202,7 +243,7 @@ jobs:
|
|||
id: cmake_test
|
||||
run: |
|
||||
cd build
|
||||
ctest --verbose --timeout 900
|
||||
ctest -L main --verbose --timeout 900
|
||||
|
||||
macOS-latest-cmake-ios:
|
||||
runs-on: macos-latest
|
||||
|
@ -295,7 +336,7 @@ jobs:
|
|||
OPENBLAS_VERSION: 0.3.23
|
||||
OPENCL_VERSION: 2023.04.17
|
||||
CLBLAST_VERSION: 1.6.0
|
||||
SDE_VERSION: 9.21.1-2023-04-24
|
||||
SDE_VERSION: 9.33.0-2024-01-07
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -394,19 +435,19 @@ jobs:
|
|||
if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # not all machines have native AVX-512
|
||||
run: |
|
||||
cd build
|
||||
ctest -C Release --verbose --timeout 900
|
||||
ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Test (Intel SDE)
|
||||
id: cmake_test_sde
|
||||
if: ${{ matrix.build == 'avx512' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
|
||||
run: |
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/777395/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
|
||||
# for some weird reason windows tar doesn't like sde tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
|
||||
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
|
||||
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
|
||||
cd build
|
||||
& $sde -future -- ctest -C Release --verbose --timeout 900
|
||||
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
|
|
5
.github/workflows/docker.yml
vendored
5
.github/workflows/docker.yml
vendored
|
@ -28,13 +28,18 @@ jobs:
|
|||
config:
|
||||
- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
||||
# have disabled them for now until the reason why
|
||||
# is understood.
|
||||
- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||
- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||
steps:
|
||||
- name: Check out the repo
|
||||
uses: actions/checkout@v3
|
||||
|
|
11
.github/workflows/nix-ci-aarch64.yml
vendored
11
.github/workflows/nix-ci-aarch64.yml
vendored
|
@ -2,13 +2,20 @@ name: Nix aarch64 builds
|
|||
|
||||
on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
schedule:
|
||||
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
|
||||
# 1.5h instead of minutes with the cold cache).
|
||||
#
|
||||
# randint(0, 59), randint(0, 23)
|
||||
- cron: '26 12 * * *'
|
||||
# But also rebuild if we touched any of the Nix expressions:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
paths: ['**/*.nix', 'flake.lock']
|
||||
|
||||
jobs:
|
||||
nix-build-aarch64:
|
||||
|
|
2
.github/workflows/nix-ci.yml
vendored
2
.github/workflows/nix-ci.yml
vendored
|
@ -5,10 +5,8 @@ on:
|
|||
push:
|
||||
branches:
|
||||
- master
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', '**/*.sh', '**/*.py', '**/*.nix']
|
||||
|
||||
jobs:
|
||||
nix-eval:
|
||||
|
|
18
.gitignore
vendored
18
.gitignore
vendored
|
@ -27,7 +27,7 @@
|
|||
lcov-report/
|
||||
gcovr-report/
|
||||
|
||||
build*/
|
||||
build*
|
||||
out/
|
||||
tmp/
|
||||
|
||||
|
@ -89,19 +89,3 @@ examples/jeopardy/results.txt
|
|||
|
||||
poetry.lock
|
||||
poetry.toml
|
||||
|
||||
# Test binaries
|
||||
/tests/test-grammar-parser
|
||||
/tests/test-llama-grammar
|
||||
/tests/test-double-float
|
||||
/tests/test-grad0
|
||||
/tests/test-opt
|
||||
/tests/test-quantize-fns
|
||||
/tests/test-quantize-perf
|
||||
/tests/test-sampling
|
||||
/tests/test-tokenizer-0-llama
|
||||
/tests/test-tokenizer-0-falcon
|
||||
/tests/test-tokenizer-1-llama
|
||||
/tests/test-tokenizer-1-bpe
|
||||
/tests/test-rope
|
||||
/tests/test-backend-ops
|
||||
|
|
138
CMakeLists.txt
138
CMakeLists.txt
|
@ -1,5 +1,6 @@
|
|||
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||
project("llama.cpp" C CXX)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
|
@ -47,6 +48,7 @@ option(BUILD_SHARED_LIBS "build shared libraries"
|
|||
option(LLAMA_STATIC "llama: static link libraries" OFF)
|
||||
option(LLAMA_NATIVE "llama: enable -march=native flag" ON)
|
||||
option(LLAMA_LTO "llama: enable link time optimization" OFF)
|
||||
option(LLAMA_CCACHE "llama: use ccache if available" ON)
|
||||
|
||||
# debug
|
||||
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
|
||||
|
@ -103,19 +105,32 @@ option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging"
|
|||
option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF)
|
||||
option(LLAMA_MPI "llama: use MPI" OFF)
|
||||
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
|
||||
option(LLAMA_SYCL "llama: use SYCL" OFF)
|
||||
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
|
||||
|
||||
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
|
||||
option(LLAMA_BUILD_SERVER "llama: build server example" ON)
|
||||
|
||||
|
||||
# add perf arguments
|
||||
option(LLAMA_PERF "llama: enable perf" OFF)
|
||||
if (LLAMA_PERF)
|
||||
add_definitions(-DGGML_PERF)
|
||||
endif()
|
||||
|
||||
# Required for relocatable CMake package
|
||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
||||
|
||||
#
|
||||
# Compile flags
|
||||
#
|
||||
if (LLAMA_SYCL)
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
else()
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
endif()
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||
set(CMAKE_C_STANDARD 11)
|
||||
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||
|
@ -463,6 +478,32 @@ if (LLAMA_HIPBLAS)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (LLAMA_SYCL)
|
||||
if ( NOT DEFINED ENV{ONEAPI_ROOT})
|
||||
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
|
||||
endif()
|
||||
#todo: AOT
|
||||
|
||||
find_package(IntelSYCL REQUIRED)
|
||||
if (LLAMA_SYCL_F16)
|
||||
add_compile_definitions(GGML_SYCL_F16)
|
||||
endif()
|
||||
add_compile_definitions(GGML_USE_SYCL)
|
||||
|
||||
add_compile_options(-I./) #include DPCT
|
||||
add_compile_options(-I/${SYCL_INCLUDE_DIR})
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
|
||||
|
||||
set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
|
||||
set(GGML_SOURCES_SYCL ggml-sycl.cpp)
|
||||
|
||||
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||
endif()
|
||||
|
||||
function(get_flags CCID CCVER)
|
||||
set(C_FLAGS "")
|
||||
set(CXX_FLAGS "")
|
||||
|
@ -475,17 +516,24 @@ function(get_flags CCID CCVER)
|
|||
(CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0) OR
|
||||
(CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
|
||||
)
|
||||
set(C_FLAGS ${C_FLAGS} -Wdouble-promotion)
|
||||
list(APPEND C_FLAGS -Wdouble-promotion)
|
||||
endif()
|
||||
elseif (CCID STREQUAL "GNU")
|
||||
set(C_FLAGS -Wdouble-promotion)
|
||||
set(CXX_FLAGS -Wno-array-bounds)
|
||||
|
||||
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wno-format-truncation)
|
||||
list(APPEND CXX_FLAGS -Wno-format-truncation)
|
||||
endif()
|
||||
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
|
||||
set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
|
||||
list(APPEND CXX_FLAGS -Wextra-semi)
|
||||
endif()
|
||||
elseif (CCID MATCHES "Intel")
|
||||
if (NOT LLAMA_SYCL)
|
||||
# enable max optimization level when using Intel compiler
|
||||
set(C_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
|
||||
add_link_options(-fuse-ld=lld -static-intel)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -514,16 +562,18 @@ if (LLAMA_ALL_WARNINGS)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
set(CUDA_FLAGS ${CXX_FLAGS} -use_fast_math)
|
||||
if (NOT MSVC)
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Wno-pedantic)
|
||||
list(APPEND CUDA_FLAGS -Wno-pedantic)
|
||||
endif()
|
||||
|
||||
if (LLAMA_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
set(NVCC_CMD ${NVCC_CMD} -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
|
@ -551,13 +601,8 @@ if (LLAMA_CUBLAS)
|
|||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(JOIN GF_CXX_FLAGS " " CUDA_CXX_FLAGS) # pass host compiler flags as a single argument
|
||||
if (NOT CUDA_CXX_FLAGS STREQUAL "")
|
||||
set(CUDA_FLAGS ${CUDA_FLAGS} -Xcompiler ${CUDA_CXX_FLAGS})
|
||||
endif()
|
||||
list(APPEND CUDA_CXX_FLAGS ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
endif()
|
||||
|
||||
if (WIN32)
|
||||
|
@ -578,6 +623,17 @@ if (LLAMA_LTO)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
if (LLAMA_CCACHE)
|
||||
find_program(LLAMA_CCACHE_FOUND ccache)
|
||||
if (LLAMA_CCACHE_FOUND)
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
|
||||
set(ENV{CCACHE_SLOPPINESS} time_macros)
|
||||
message(STATUS "Using ccache")
|
||||
else()
|
||||
message(STATUS "Warning: ccache not found - consider installing it or use LLAMA_CCACHE=OFF")
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
# this version of Apple ld64 is buggy
|
||||
execute_process(
|
||||
COMMAND ${CMAKE_C_COMPILER} ${CMAKE_EXE_LINKER_FLAGS} -Wl,-v
|
||||
|
@ -611,12 +667,7 @@ if (NOT MSVC)
|
|||
endif()
|
||||
endif()
|
||||
|
||||
function(add_compile_option_cpp ARG)
|
||||
# Adds a compile option to C/C++ only, but not for Cuda.
|
||||
# Use, e.g., for CPU-architecture flags.
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:${ARG}>)
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:C>:${ARG}>)
|
||||
endfunction()
|
||||
set(ARCH_FLAGS "")
|
||||
|
||||
if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64") OR ("${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "arm64"))
|
||||
message(STATUS "ARM detected")
|
||||
|
@ -629,19 +680,19 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR (${CMAKE_SYSTEM_PROCESSOR} MATC
|
|||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
add_compile_options(-mfp16-format=ieee)
|
||||
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
# Raspberry Pi 2
|
||||
add_compile_options(-mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
add_compile_options(-mno-unaligned-access)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GENERATOR_PLATFORM_LWR}" MATCHES "^(x86_64|i686|amd64|x64)$" )
|
||||
|
@ -652,7 +703,7 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
|||
include(cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_option_cpp(/arch:AVX512)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
|
@ -666,49 +717,61 @@ elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "^(x86_64|i686|AMD64)$" OR "${CMAKE_GE
|
|||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||
endif()
|
||||
elseif (LLAMA_AVX2)
|
||||
add_compile_option_cpp(/arch:AVX2)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||
elseif (LLAMA_AVX)
|
||||
add_compile_option_cpp(/arch:AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
endif()
|
||||
else()
|
||||
if (LLAMA_NATIVE)
|
||||
add_compile_option_cpp(-march=native)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
endif()
|
||||
if (LLAMA_F16C)
|
||||
add_compile_option_cpp(-mf16c)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
endif()
|
||||
if (LLAMA_FMA)
|
||||
add_compile_option_cpp(-mfma)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
endif()
|
||||
if (LLAMA_AVX)
|
||||
add_compile_option_cpp(-mavx)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
endif()
|
||||
if (LLAMA_AVX2)
|
||||
add_compile_option_cpp(-mavx2)
|
||||
list(APPEND ARCH_FLAGS -mavx2)
|
||||
endif()
|
||||
if (LLAMA_AVX512)
|
||||
add_compile_option_cpp(-mavx512f)
|
||||
add_compile_option_cpp(-mavx512bw)
|
||||
list(APPEND ARCH_FLAGS -mavx512f)
|
||||
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VBMI)
|
||||
add_compile_option_cpp(-mavx512vbmi)
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
if (LLAMA_AVX512_VNNI)
|
||||
add_compile_option_cpp(-mavx512vnni)
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
add_compile_options(-mcpu=powerpc64le)
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
else()
|
||||
add_compile_options(-mcpu=native -mtune=native)
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Unknown architecture")
|
||||
endif()
|
||||
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
||||
|
||||
if (LLAMA_CUBLAS)
|
||||
list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
|
||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||
endif()
|
||||
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
endif()
|
||||
|
||||
if (MINGW)
|
||||
# Target Windows 8 for PrefetchVirtualMemory
|
||||
add_compile_definitions(_WIN32_WINNT=${LLAMA_WIN_VER})
|
||||
|
@ -788,6 +851,7 @@ add_library(ggml OBJECT
|
|||
${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL}
|
||||
${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI}
|
||||
${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA}
|
||||
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
|
||||
)
|
||||
|
||||
target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
|
||||
|
@ -863,7 +927,7 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/LlamaConfig.cmake
|
|||
${CMAKE_CURRENT_BINARY_DIR}/LlamaConfigVersion.cmake
|
||||
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Llama)
|
||||
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h"
|
||||
set(GGML_PUBLIC_HEADERS "ggml.h" "ggml-alloc.h" "ggml-backend.h"
|
||||
"${GGML_HEADERS_CUDA}" "${GGML_HEADERS_OPENCL}"
|
||||
"${GGML_HEADERS_METAL}" "${GGML_HEADERS_MPI}" "${GGML_HEADERS_EXTRA}")
|
||||
|
||||
|
|
10
Makefile
10
Makefile
|
@ -9,7 +9,7 @@ TEST_TARGETS = \
|
|||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \
|
||||
tests/test-backend-ops
|
||||
tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
|
@ -632,7 +632,7 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
|
|||
save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
server: examples/server/server.cpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h,$(filter-out %.hpp,$^)) -o $@ $(LDFLAGS) $(LWINSOCK2) -Wno-cast-qual
|
||||
|
||||
gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
|
||||
|
@ -760,3 +760,9 @@ tests/test-c.o: tests/test-c.c llama.h
|
|||
|
||||
tests/test-backend-ops: tests/test-backend-ops.cpp ggml.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-model-load-cancel: tests/test-model-load-cancel.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-autorelease: tests/test-autorelease.cpp ggml.o llama.o tests/get-model.cpp $(COMMON_DEPS) $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
|
32
README.md
32
README.md
|
@ -10,11 +10,11 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
|
|||
|
||||
### Hot topics
|
||||
|
||||
- ⚠️ Incoming backends: https://github.com/ggerganov/llama.cpp/discussions/5138
|
||||
- New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
|
||||
- Collecting Apple Silicon performance stats:
|
||||
- M-series: https://github.com/ggerganov/llama.cpp/discussions/4167
|
||||
- A-series: https://github.com/ggerganov/llama.cpp/discussions/4508
|
||||
- Added Mixtral support: https://github.com/ggerganov/llama.cpp/pull/4406
|
||||
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
|
||||
|
||||
----
|
||||
|
@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
|
|||
- AVX, AVX2 and AVX512 support for x86 architectures
|
||||
- Mixed F16 / F32 precision
|
||||
- 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
|
||||
- CUDA, Metal and OpenCL GPU backend support
|
||||
- CUDA, Metal, OpenCL, SYCL GPU backend support
|
||||
|
||||
The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
|
||||
Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
|
||||
|
@ -112,6 +112,7 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- [x] [Bakllava](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
|
||||
- [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
|
||||
- [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
|
||||
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
|
||||
|
||||
|
||||
**Bindings:**
|
||||
|
@ -121,13 +122,15 @@ as the main playground for developing new features for the [ggml](https://github
|
|||
- Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
|
||||
- JS/TS (llama.cpp server client): [lgrammel/modelfusion](https://modelfusion.dev/integration/model-provider/llamacpp)
|
||||
- Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb)
|
||||
- Rust: [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp)
|
||||
- Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs)
|
||||
- C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp)
|
||||
- Scala 3: [donderom/llm4s](https://github.com/donderom/llm4s)
|
||||
- Clojure: [phronmophobic/llama.clj](https://github.com/phronmophobic/llama.clj)
|
||||
- React Native: [mybigday/llama.rn](https://github.com/mybigday/llama.rn)
|
||||
- Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp)
|
||||
- Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
|
||||
- Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
|
||||
|
||||
**UI:**
|
||||
|
||||
|
@ -596,6 +599,15 @@ Building the program with BLAS support may lead to some performance improvements
|
|||
|
||||
You can get a list of platforms and devices from the `clinfo -l` command, etc.
|
||||
|
||||
- #### SYCL
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
|
||||
|
||||
llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
|
||||
|
||||
For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
|
||||
|
||||
|
||||
### Prepare Data & Run
|
||||
|
||||
```bash
|
||||
|
@ -929,17 +941,20 @@ Place your desired model into the `~/llama.cpp/models/` directory and execute th
|
|||
* Create a folder to store big models & intermediate files (ex. /llama/models)
|
||||
|
||||
#### Images
|
||||
We have two Docker images available for this project:
|
||||
We have three Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
Additionally, there the following images, similar to the above:
|
||||
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
|
||||
The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](.github/workflows/docker.yml). If you need different settings (for example, a different CUDA or ROCm library, you'll need to build the images locally for now).
|
||||
|
||||
|
@ -965,6 +980,12 @@ or with a light image:
|
|||
docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
|
||||
```
|
||||
|
||||
or with a server image:
|
||||
|
||||
```bash
|
||||
docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
|
||||
```
|
||||
|
||||
### Docker With CUDA
|
||||
|
||||
Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia-container-toolkit) properly installed on Linux, or is using a GPU enabled cloud, `cuBLAS` should be accessible inside the container.
|
||||
|
@ -974,6 +995,7 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
|
|||
```bash
|
||||
docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:light-cuda -f .devops/main-cuda.Dockerfile .
|
||||
docker build -t local/llama.cpp:server-cuda -f .devops/server-cuda.Dockerfile .
|
||||
```
|
||||
|
||||
You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.
|
||||
|
@ -987,6 +1009,7 @@ The resulting images, are essentially the same as the non-CUDA images:
|
|||
|
||||
1. `local/llama.cpp:full-cuda`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization.
|
||||
2. `local/llama.cpp:light-cuda`: This image only includes the main executable file.
|
||||
3. `local/llama.cpp:server-cuda`: This image only includes the server executable file.
|
||||
|
||||
#### Usage
|
||||
|
||||
|
@ -995,6 +1018,7 @@ After building locally, Usage is similar to the non-CUDA examples, but you'll ne
|
|||
```bash
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
|
||||
docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512 --n-gpu-layers 1
|
||||
```
|
||||
|
||||
### Contributing
|
||||
|
|
252
README_sycl.md
Normal file
252
README_sycl.md
Normal file
|
@ -0,0 +1,252 @@
|
|||
# llama.cpp for SYCL
|
||||
|
||||
[Background](#background)
|
||||
|
||||
[OS](#os)
|
||||
|
||||
[Intel GPU](#intel-gpu)
|
||||
|
||||
[Linux](#linux)
|
||||
|
||||
[Environment Variable](#environment-variable)
|
||||
|
||||
[Known Issue](#known-issue)
|
||||
|
||||
[Todo](#todo)
|
||||
|
||||
## Background
|
||||
|
||||
SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
|
||||
|
||||
oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
|
||||
|
||||
Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
|
||||
|
||||
To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
|
||||
|
||||
The llama.cpp for SYCL is used to support Intel GPUs.
|
||||
|
||||
For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
|
||||
|
||||
## OS
|
||||
|
||||
|OS|Status|Verified|
|
||||
|-|-|-|
|
||||
|Linux|Support|Ubuntu 22.04|
|
||||
|Windows|Ongoing| |
|
||||
|
||||
|
||||
## Intel GPU
|
||||
|
||||
|Intel GPU| Status | Verified Model|
|
||||
|-|-|-|
|
||||
|Intel Data Center Max Series| Support| Max 1550|
|
||||
|Intel Data Center Flex Series| Support| Flex 170|
|
||||
|Intel Arc Series| Support| Arc 770|
|
||||
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
|
||||
|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
|
||||
|
||||
|
||||
## Linux
|
||||
|
||||
### Setup Environment
|
||||
|
||||
1. Install Intel GPU driver.
|
||||
|
||||
a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
|
||||
|
||||
Note: for iGPU, please install the client GPU driver.
|
||||
|
||||
b. Add user to group: video, render.
|
||||
|
||||
```
|
||||
sudo usermod -aG render username
|
||||
sudo usermod -aG video username
|
||||
```
|
||||
|
||||
Note: re-login to enable it.
|
||||
|
||||
c. Check
|
||||
|
||||
```
|
||||
sudo apt install clinfo
|
||||
sudo clinfo -l
|
||||
```
|
||||
|
||||
Output (example):
|
||||
|
||||
```
|
||||
Platform #0: Intel(R) OpenCL Graphics
|
||||
`-- Device #0: Intel(R) Arc(TM) A770 Graphics
|
||||
|
||||
|
||||
Platform #0: Intel(R) OpenCL HD Graphics
|
||||
`-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
|
||||
```
|
||||
|
||||
2. Install Intel® oneAPI Base toolkit.
|
||||
|
||||
|
||||
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
|
||||
|
||||
Recommend to install to default folder: **/opt/intel/oneapi**.
|
||||
|
||||
Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
|
||||
|
||||
b. Check
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
sycl-ls
|
||||
```
|
||||
|
||||
There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
|
||||
|
||||
Output (example):
|
||||
```
|
||||
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
|
||||
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
|
||||
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
|
||||
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
|
||||
|
||||
```
|
||||
|
||||
2. Build locally:
|
||||
|
||||
```
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```
|
||||
./examples/sycl/build.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
|
||||
|
||||
### Run
|
||||
|
||||
1. Put model file to folder **models**
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. List device ID
|
||||
|
||||
Run without parameter:
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
|
||||
or
|
||||
|
||||
./build/bin/main
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
||||
|
||||
4. Set device ID and execute llama.cpp
|
||||
|
||||
Set device ID = 0 by **GGML_SYCL_DEVICE=0**
|
||||
|
||||
```
|
||||
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
|
||||
```
|
||||
or run by script:
|
||||
|
||||
```
|
||||
./examples/sycl/run_llama2.sh
|
||||
```
|
||||
|
||||
Note:
|
||||
|
||||
- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
|
||||
|
||||
|
||||
5. Check the device ID in output
|
||||
|
||||
Like:
|
||||
```
|
||||
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
|
||||
```
|
||||
|
||||
|
||||
## Environment Variable
|
||||
|
||||
#### Build
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
|
||||
|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
|
||||
|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
|
||||
|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
|
||||
|
||||
#### Running
|
||||
|
||||
|
||||
|Name|Value|Function|
|
||||
|-|-|-|
|
||||
|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
|
||||
|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
|
||||
|
||||
## Known Issue
|
||||
|
||||
- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
|
||||
|
||||
Miss to enable oneAPI running environment.
|
||||
|
||||
Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
|
||||
|
||||
|
||||
- Hang during startup
|
||||
|
||||
llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
|
||||
|
||||
Solution: add **--no-mmap**.
|
||||
|
||||
## Todo
|
||||
|
||||
- Support to build in Windows.
|
||||
|
||||
- Support multiple cards.
|
|
@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
|||
|
||||
# with CUDA support
|
||||
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
# with SYCL support
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
```
|
||||
|
|
113
ci/run.sh
113
ci/run.sh
|
@ -10,6 +10,9 @@
|
|||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
|
@ -22,9 +25,9 @@ mkdir -p "$2"
|
|||
OUT=$(realpath "$1")
|
||||
MNT=$(realpath "$2")
|
||||
|
||||
rm -v $OUT/*.log
|
||||
rm -v $OUT/*.exit
|
||||
rm -v $OUT/*.md
|
||||
rm -f "$OUT/*.log"
|
||||
rm -f "$OUT/*.exit"
|
||||
rm -f "$OUT/*.md"
|
||||
|
||||
sd=`dirname $0`
|
||||
cd $sd/../
|
||||
|
@ -36,6 +39,18 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
|
|||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
||||
fi
|
||||
|
||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||
if [ -z ${ONEAPI_ROOT} ]; then
|
||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
||||
fi
|
||||
## helpers
|
||||
|
||||
# download a file if it does not exist or if it is outdated
|
||||
|
@ -90,7 +105,7 @@ function gg_run_ctest_debug {
|
|||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
|
||||
set +e
|
||||
}
|
||||
|
@ -119,9 +134,9 @@ function gg_run_ctest_release {
|
|||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
else
|
||||
(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
fi
|
||||
|
||||
set +e
|
||||
|
@ -137,6 +152,61 @@ function gg_sum_ctest_release {
|
|||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
function gg_get_model {
|
||||
local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
|
||||
local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
|
||||
if [[ -s $gguf_3b ]]; then
|
||||
echo -n "$gguf_3b"
|
||||
elif [[ -s $gguf_7b ]]; then
|
||||
echo -n "$gguf_7b"
|
||||
else
|
||||
echo >&2 "No model found. Can't run gg_run_ctest_with_model."
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
function gg_run_ctest_with_model_debug {
|
||||
cd ${SRC}
|
||||
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-debug
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
||||
function gg_run_ctest_with_model_release {
|
||||
cd ${SRC}
|
||||
|
||||
local model; model=$(gg_get_model)
|
||||
cd build-ci-release
|
||||
set -e
|
||||
(LLAMACPP_TEST_MODELFILE="$model" time ctest --output-on-failure -L model) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||
set +e
|
||||
cd ..
|
||||
}
|
||||
|
||||
function gg_sum_ctest_with_model_debug {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest with model files in debug mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
function gg_sum_ctest_with_model_release {
|
||||
gg_printf '### %s\n\n' "${ci}"
|
||||
|
||||
gg_printf 'Runs ctest with model files in release mode\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '```\n'
|
||||
gg_printf '%s\n' "$(cat $OUT/${ci}-ctest.log)"
|
||||
gg_printf '```\n'
|
||||
}
|
||||
|
||||
# open_llama_3b_v2
|
||||
|
||||
function gg_run_open_llama_3b_v2 {
|
||||
|
@ -160,8 +230,8 @@ function gg_run_open_llama_3b_v2 {
|
|||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
|
@ -214,6 +284,8 @@ function gg_run_open_llama_3b_v2 {
|
|||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
|
@ -241,6 +313,8 @@ function gg_run_open_llama_3b_v2 {
|
|||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
|
@ -282,7 +356,6 @@ function gg_run_open_llama_3b_v2 {
|
|||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
||||
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
||||
|
||||
|
||||
set +e
|
||||
}
|
||||
|
||||
|
@ -292,6 +365,7 @@ function gg_sum_open_llama_3b_v2 {
|
|||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
|
@ -337,8 +411,8 @@ function gg_run_open_llama_7b_v2 {
|
|||
|
||||
set -e
|
||||
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||
|
||||
python3 ../convert.py ${path_models}
|
||||
|
||||
|
@ -391,6 +465,8 @@ function gg_run_open_llama_7b_v2 {
|
|||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||
|
||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||
|
||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||
|
||||
function check_ppl {
|
||||
|
@ -418,6 +494,8 @@ function gg_run_open_llama_7b_v2 {
|
|||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||
|
||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||
|
||||
# lora
|
||||
function compare_ppl {
|
||||
qnt="$1"
|
||||
|
@ -469,6 +547,7 @@ function gg_sum_open_llama_7b_v2 {
|
|||
gg_printf 'OpenLLaMA 7B-v2:\n'
|
||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||
|
@ -492,14 +571,18 @@ function gg_sum_open_llama_7b_v2 {
|
|||
## main
|
||||
|
||||
if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
||||
# Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
|
||||
rm -rf ${SRC}/models-mnt
|
||||
|
||||
mnt_models=${MNT}/models
|
||||
mkdir -p ${mnt_models}
|
||||
ln -sfn ${mnt_models} ${SRC}/models-mnt
|
||||
|
||||
python3 -m pip install -r ${SRC}/requirements.txt
|
||||
python3 -m pip install --editable gguf-py
|
||||
# Create a fresh python3 venv and enter it
|
||||
python3 -m venv "$MNT/venv"
|
||||
source "$MNT/venv/bin/activate"
|
||||
|
||||
pip install -r ${SRC}/requirements.txt --disable-pip-version-check
|
||||
pip install --editable gguf-py --disable-pip-version-check
|
||||
fi
|
||||
|
||||
ret=0
|
||||
|
@ -514,6 +597,8 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
|
|||
else
|
||||
test $ret -eq 0 && gg_run open_llama_7b_v2
|
||||
fi
|
||||
test $ret -eq 0 && gg_run ctest_with_model_debug
|
||||
test $ret -eq 0 && gg_run ctest_with_model_release
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
@ -42,6 +42,10 @@
|
|||
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||
#endif
|
||||
|
||||
#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
|
||||
#define GGML_USE_CUBLAS_SYCL
|
||||
#endif
|
||||
|
||||
int32_t get_num_physical_cores() {
|
||||
#ifdef __linux__
|
||||
// enumerate the set of thread siblings, num entries is num cores
|
||||
|
@ -203,6 +207,23 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.prompt_cache_all = true;
|
||||
} else if (arg == "--prompt-cache-ro") {
|
||||
params.prompt_cache_ro = true;
|
||||
} else if (arg == "-bf" || arg == "--binary-file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
std::ifstream file(argv[i], std::ios::binary);
|
||||
if (!file) {
|
||||
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
// store the external file name in params
|
||||
params.prompt_file = argv[i];
|
||||
std::ostringstream ss;
|
||||
ss << file.rdbuf();
|
||||
params.prompt = ss.str();
|
||||
fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]);
|
||||
} else if (arg == "-f" || arg == "--file") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -582,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.main_gpu = std::stoi(argv[i]);
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--split-mode" || arg == "-sm") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -601,9 +622,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
|
||||
} else if (arg == "--tensor-split" || arg == "-ts") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
|
@ -626,9 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
params.tensor_split[i] = 0.0f;
|
||||
}
|
||||
}
|
||||
#ifndef GGML_USE_CUBLAS
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS
|
||||
#ifndef GGML_USE_CUBLAS_SYCL
|
||||
fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
|
||||
#endif // GGML_USE_CUBLAS_SYCL
|
||||
} else if (arg == "--no-mmap") {
|
||||
params.use_mmap = false;
|
||||
} else if (arg == "--numa") {
|
||||
|
@ -653,6 +675,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
if (params.logdir.back() != DIRECTORY_SEPARATOR) {
|
||||
params.logdir += DIRECTORY_SEPARATOR;
|
||||
}
|
||||
} else if (arg == "--save-all-logits" || arg == "--kl-divergence-base") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.logits_file = argv[i];
|
||||
} else if (arg == "--perplexity" || arg == "--all-logits") {
|
||||
params.logits_all = true;
|
||||
} else if (arg == "--ppl-stride") {
|
||||
|
@ -681,6 +709,24 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
|
|||
break;
|
||||
}
|
||||
params.hellaswag_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--winogrande") {
|
||||
params.winogrande = true;
|
||||
} else if (arg == "--winogrande-tasks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.winogrande_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--multiple-choice") {
|
||||
params.multiple_choice = true;
|
||||
} else if (arg == "--multiple-choice-tasks") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.multiple_choice_tasks = std::stoi(argv[i]);
|
||||
} else if (arg == "--kl-divergence") {
|
||||
params.kl_divergence = true;
|
||||
} else if (arg == "--ignore-eos") {
|
||||
params.ignore_eos = true;
|
||||
} else if (arg == "--no-penalize-nl") {
|
||||
|
@ -880,6 +926,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" --in-suffix STRING string to suffix after user inputs with (default: empty)\n");
|
||||
printf(" -f FNAME, --file FNAME\n");
|
||||
printf(" prompt file to start generation.\n");
|
||||
printf(" -bf FNAME, --binary-file FNAME\n");
|
||||
printf(" binary file containing multiple choice tasks.\n");
|
||||
printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
|
@ -926,6 +974,11 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n");
|
||||
printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks);
|
||||
printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --winogrande-tasks N number of tasks to use when computing the Winogrande score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --multiple-choice compute multiple choice score over random tasks from datafile supplied with -f\n");
|
||||
printf(" --multiple-choice-tasks N number of tasks to use when computing the multiple choice score (default: %zu)\n", params.winogrande_tasks);
|
||||
printf(" --kl-divergence computes KL-divergence to logits provided via --kl-divergence-base");
|
||||
printf(" --keep N number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep);
|
||||
printf(" --draft N number of tokens to draft for speculative decoding (default: %d)\n", params.n_draft);
|
||||
printf(" --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks);
|
||||
|
@ -959,7 +1012,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
|||
printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
|
||||
printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
|
||||
printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
|
||||
#endif
|
||||
#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
|
||||
printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
|
||||
printf(" -gan N, --grp-attn-n N\n");
|
||||
|
@ -1466,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
|
|||
fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false");
|
||||
fprintf(stream, "cpu_has_fma: %s\n", ggml_cpu_has_fma() ? "true" : "false");
|
||||
|
|
|
@ -91,6 +91,7 @@ struct gpt_params {
|
|||
std::string input_suffix = ""; // string to suffix user inputs with
|
||||
std::vector<std::string> antiprompt; // string upon seeing which more user input is prompted
|
||||
std::string logdir = ""; // directory in which to save YAML log files
|
||||
std::string logits_file = ""; // file for saving *all* logits
|
||||
|
||||
std::vector<llama_model_kv_override> kv_overrides;
|
||||
|
||||
|
@ -105,6 +106,14 @@ struct gpt_params {
|
|||
bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt
|
||||
size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score
|
||||
|
||||
bool winogrande = false; // compute Winogrande score over random tasks from datafile supplied in prompt
|
||||
size_t winogrande_tasks= 0; // number of tasks to use when computing the Winogrande score. If 0, all tasks will be computed
|
||||
|
||||
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
|
||||
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
|
||||
|
||||
bool kl_divergence = false; // compute KL-divergence
|
||||
|
||||
bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS
|
||||
bool random_prompt = false; // do not randomize prompt if none provided
|
||||
bool use_color = false; // use color to distinguish generations and inputs
|
||||
|
|
|
@ -13,6 +13,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
|||
// will be empty (default) if there are parse errors
|
||||
if (result->parsed_grammar.rules.empty()) {
|
||||
fprintf(stderr, "%s: failed to parse grammar\n", __func__);
|
||||
delete result;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -129,6 +130,8 @@ static void sampler_queue(
|
|||
const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
|
||||
|
||||
const float temp = params.temp;
|
||||
const float dynatemp_range = params.dynatemp_range;
|
||||
const float dynatemp_exponent = params.dynatemp_exponent;
|
||||
const int32_t top_k = params.top_k <= 0 ? n_vocab : params.top_k;
|
||||
const float top_p = params.top_p;
|
||||
const float min_p = params.min_p;
|
||||
|
@ -143,7 +146,15 @@ static void sampler_queue(
|
|||
case 'y': llama_sample_typical (ctx_main, &cur_p, typical_p, min_keep); break;
|
||||
case 'p': llama_sample_top_p (ctx_main, &cur_p, top_p, min_keep); break;
|
||||
case 'm': llama_sample_min_p (ctx_main, &cur_p, min_p, min_keep); break;
|
||||
case 't': llama_sample_temp (ctx_main, &cur_p, temp); break;
|
||||
case 't':
|
||||
if (dynatemp_range > 0) {
|
||||
float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
|
||||
float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
|
||||
llama_sample_entropy(ctx_main, &cur_p, dynatemp_min, dynatemp_max, dynatemp_exponent);
|
||||
} else {
|
||||
llama_sample_temp(ctx_main, &cur_p, temp);
|
||||
}
|
||||
break;
|
||||
default : break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,9 @@ typedef struct llama_sampling_params {
|
|||
float min_p = 0.05f; // 0.0 = disabled
|
||||
float tfs_z = 1.00f; // 1.0 = disabled
|
||||
float typical_p = 1.00f; // 1.0 = disabled
|
||||
float temp = 0.80f; // 1.0 = disabled
|
||||
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
|
||||
float dynatemp_range = 0.00f; // 0.0 = disabled
|
||||
float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler
|
||||
int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size)
|
||||
float penalty_repeat = 1.10f; // 1.0 = disabled
|
||||
float penalty_freq = 0.00f; // 0.0 = disabled
|
||||
|
|
|
@ -10,7 +10,7 @@ import re
|
|||
import sys
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast, Optional
|
||||
from typing import TYPE_CHECKING, Any, ContextManager, Iterator, cast
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
@ -189,6 +189,8 @@ class Model:
|
|||
return StableLMModel
|
||||
if model_architecture == "QWenLMHeadModel":
|
||||
return QwenModel
|
||||
if model_architecture == "Qwen2ForCausalLM":
|
||||
return Model
|
||||
if model_architecture == "MixtralForCausalLM":
|
||||
return MixtralModel
|
||||
if model_architecture == "GPT2LMHeadModel":
|
||||
|
@ -197,6 +199,10 @@ class Model:
|
|||
return Phi2Model
|
||||
if model_architecture == "PlamoForCausalLM":
|
||||
return PlamoModel
|
||||
if model_architecture == "CodeShellForCausalLM":
|
||||
return CodeShellModel
|
||||
if model_architecture == "OrionForCausalLM":
|
||||
return OrionModel
|
||||
return Model
|
||||
|
||||
def _is_model_safetensors(self) -> bool:
|
||||
|
@ -234,6 +240,8 @@ class Model:
|
|||
return gguf.MODEL_ARCH.STABLELM
|
||||
if arch == "QWenLMHeadModel":
|
||||
return gguf.MODEL_ARCH.QWEN
|
||||
if arch == "Qwen2ForCausalLM":
|
||||
return gguf.MODEL_ARCH.QWEN2
|
||||
if arch == "MixtralForCausalLM":
|
||||
return gguf.MODEL_ARCH.LLAMA
|
||||
if arch == "GPT2LMHeadModel":
|
||||
|
@ -242,6 +250,10 @@ class Model:
|
|||
return gguf.MODEL_ARCH.PHI2
|
||||
if arch == "PlamoForCausalLM":
|
||||
return gguf.MODEL_ARCH.PLAMO
|
||||
if arch == "CodeShellForCausalLM":
|
||||
return gguf.MODEL_ARCH.CODESHELL
|
||||
if arch == "OrionForCausalLM":
|
||||
return gguf.MODEL_ARCH.ORION
|
||||
|
||||
raise NotImplementedError(f'Architecture "{arch}" not supported!')
|
||||
|
||||
|
@ -281,6 +293,58 @@ class Model:
|
|||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_qwen(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[QwenModel.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) == 2
|
||||
merges.append(' '.join(map(QwenModel.token_bytes_to_string, merged)))
|
||||
|
||||
# for this kind of tokenizer, added_vocab is not a subset of vocab, so they need to be combined
|
||||
added_vocab = tokenizer.special_tokens
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in (vocab | added_vocab).items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
||||
tokens.append(bytearray(pad_token))
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
# only add special tokens when they were not already loaded from config.json
|
||||
if len(special_vocab.special_token_ids) == 0:
|
||||
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
# this one is usually not in config.json anyway
|
||||
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
|
||||
def _set_vocab_sentencepiece(self):
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
|
@ -479,7 +543,8 @@ class MPTModel(Model):
|
|||
# map tensor names
|
||||
if "scales" in name:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales"))
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
if new_name is not None:
|
||||
new_name = new_name.replace("scales", "act.scales")
|
||||
else:
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
|
@ -511,6 +576,83 @@ class MPTModel(Model):
|
|||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
|
||||
|
||||
class OrionModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
head_count = self.hparams["num_attention_heads"]
|
||||
head_count_kv = self.hparams.get("num_key_value_heads", head_count)
|
||||
hf_repo = self.hparams.get("_name_or_path", "")
|
||||
|
||||
ctx_length = 0
|
||||
if "max_sequence_length" in self.hparams:
|
||||
ctx_length = self.hparams["max_sequence_length"]
|
||||
elif "max_position_embeddings" in self.hparams:
|
||||
ctx_length = self.hparams["max_position_embeddings"]
|
||||
elif "model_max_length" in self.hparams:
|
||||
ctx_length = self.hparams["model_max_length"]
|
||||
else:
|
||||
print("gguf: can not find ctx length parameter.")
|
||||
sys.exit()
|
||||
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_name(self.dir_model.name)
|
||||
self.gguf_writer.add_source_hf_repo(hf_repo)
|
||||
self.gguf_writer.add_tensor_data_layout("Meta AI original pth")
|
||||
self.gguf_writer.add_context_length(ctx_length)
|
||||
self.gguf_writer.add_embedding_length(self.hparams["hidden_size"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"])
|
||||
self.gguf_writer.add_head_count(head_count)
|
||||
self.gguf_writer.add_head_count_kv(head_count_kv)
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"])
|
||||
|
||||
def write_tensors(self):
|
||||
# Collect tensors from generator object
|
||||
model_kv = dict(self.get_tensors())
|
||||
block_count = self.hparams["num_hidden_layers"]
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
|
||||
for name, data_torch in model_kv.items():
|
||||
# we don't need these
|
||||
if name.endswith(".rotary_emb.inv_freq"):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class BaichuanModel(Model):
|
||||
def set_vocab(self):
|
||||
self._set_vocab_sentencepiece()
|
||||
|
@ -868,6 +1010,13 @@ class PersimmonModel(Model):
|
|||
|
||||
|
||||
class StableLMModel(Model):
|
||||
def set_vocab(self):
|
||||
if (self.dir_model / "tokenizer.json").is_file():
|
||||
self._set_vocab_gpt2()
|
||||
else:
|
||||
# StableLM 2 1.6B uses a vocab in a similar format to Qwen's vocab
|
||||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
hparams = self.hparams
|
||||
block_count = hparams["num_hidden_layers"]
|
||||
|
@ -896,7 +1045,7 @@ class QwenModel(Model):
|
|||
return ''.join([byte_encoder[ord(char)] for char in b.decode('latin-1')])
|
||||
|
||||
@staticmethod
|
||||
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: Optional[int] = None) -> list[bytes]:
|
||||
def bpe(mergeable_ranks: dict[bytes, int], token: bytes, max_rank: int | None = None) -> list[bytes]:
|
||||
parts = [bytes([b]) for b in token]
|
||||
while True:
|
||||
min_idx = None
|
||||
|
@ -913,52 +1062,7 @@ class QwenModel(Model):
|
|||
return parts
|
||||
|
||||
def set_vocab(self):
|
||||
dir_model = self.dir_model
|
||||
hparams = self.hparams
|
||||
tokens: list[bytearray] = []
|
||||
toktypes: list[int] = []
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
|
||||
vocab_size = hparams["vocab_size"]
|
||||
assert max(tokenizer.get_vocab().values()) < vocab_size
|
||||
|
||||
merges = []
|
||||
vocab = {}
|
||||
mergeable_ranks = tokenizer.mergeable_ranks
|
||||
for token, rank in mergeable_ranks.items():
|
||||
vocab[self.token_bytes_to_string(token)] = rank
|
||||
if len(token) == 1:
|
||||
continue
|
||||
merged = QwenModel.bpe(mergeable_ranks, token, max_rank=rank)
|
||||
assert len(merged) == 2
|
||||
merges.append(' '.join(map(self.token_bytes_to_string, merged)))
|
||||
|
||||
reverse_vocab = {id_ : encoded_tok for encoded_tok, id_ in vocab.items()}
|
||||
added_vocab = tokenizer.special_tokens
|
||||
|
||||
for i in range(vocab_size):
|
||||
if i not in reverse_vocab:
|
||||
pad_token = f"[PAD{i}]".encode("utf-8")
|
||||
tokens.append(bytearray(pad_token))
|
||||
toktypes.append(gguf.TokenType.USER_DEFINED)
|
||||
elif reverse_vocab[i] in added_vocab:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.CONTROL)
|
||||
else:
|
||||
tokens.append(reverse_vocab[i])
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
self.gguf_writer.add_tokenizer_model("gpt2")
|
||||
self.gguf_writer.add_token_list(tokens)
|
||||
self.gguf_writer.add_token_types(toktypes)
|
||||
|
||||
special_vocab = gguf.SpecialVocab(dir_model, load_merges=False)
|
||||
special_vocab.merges = merges
|
||||
special_vocab._set_special_token("bos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("eos", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
|
||||
special_vocab.add_to_gguf(self.gguf_writer)
|
||||
self._set_vocab_qwen()
|
||||
|
||||
def set_gguf_parameters(self):
|
||||
self.gguf_writer.add_name("Qwen")
|
||||
|
@ -1176,6 +1280,70 @@ class PlamoModel(Model):
|
|||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
|
||||
class CodeShellModel(Model):
|
||||
def set_gguf_parameters(self):
|
||||
block_count = self.hparams["n_layer"]
|
||||
|
||||
self.gguf_writer.add_name("CodeShell")
|
||||
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||
self.gguf_writer.add_embedding_length(self.hparams["n_embd"])
|
||||
self.gguf_writer.add_feed_forward_length(4 * self.hparams["n_embd"])
|
||||
self.gguf_writer.add_block_count(block_count)
|
||||
self.gguf_writer.add_head_count(self.hparams["n_head"])
|
||||
self.gguf_writer.add_head_count_kv(self.hparams["num_query_groups"])
|
||||
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||
self.gguf_writer.add_file_type(self.ftype)
|
||||
self.gguf_writer.add_rope_freq_base(10000.0)
|
||||
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||
|
||||
def write_tensors(self):
|
||||
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
|
||||
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
|
||||
tensors = dict(self.get_tensors())
|
||||
has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys()
|
||||
for name, data_torch in tensors.items():
|
||||
# we don't need these
|
||||
if name.endswith((".attn.rotary_emb.inv_freq")):
|
||||
continue
|
||||
|
||||
old_dtype = data_torch.dtype
|
||||
|
||||
# convert any unsupported data types to float32
|
||||
if data_torch.dtype not in (torch.float16, torch.float32):
|
||||
data_torch = data_torch.to(torch.float32)
|
||||
|
||||
data = data_torch.squeeze().numpy()
|
||||
|
||||
# map tensor names
|
||||
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
|
||||
if new_name is None:
|
||||
print(f"Can not map tensor {name!r}")
|
||||
sys.exit()
|
||||
|
||||
n_dims = len(data.shape)
|
||||
data_dtype = data.dtype
|
||||
|
||||
# if f32 desired, convert any float16 to float32
|
||||
if self.ftype == 0 and data_dtype == np.float16:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
|
||||
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
|
||||
data = data.astype(np.float32)
|
||||
|
||||
# if f16 desired, convert any float32 2-dim weight tensors to float16
|
||||
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
|
||||
data = data.astype(np.float16)
|
||||
|
||||
print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
self.gguf_writer.add_tensor(new_name, data)
|
||||
|
||||
if not has_lm_head and name == "transformer.wte.weight":
|
||||
self.gguf_writer.add_tensor("output.weight", data)
|
||||
print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}")
|
||||
|
||||
###### CONVERSION LOGIC ######
|
||||
|
||||
|
||||
|
@ -1213,7 +1381,7 @@ def main() -> None:
|
|||
|
||||
if args.awq_path:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'awq-py'))
|
||||
from awq.apply_awq import add_scale_weights
|
||||
from awq.apply_awq import add_scale_weights # type: ignore[import-not-found]
|
||||
tmp_model_path = args.model / "weighted_model"
|
||||
dir_model = tmp_model_path
|
||||
if tmp_model_path.is_dir():
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import struct
|
||||
import sys
|
||||
from enum import IntEnum
|
||||
|
@ -9,7 +10,6 @@ from pathlib import Path
|
|||
|
||||
import numpy as np
|
||||
|
||||
import os
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
@ -371,15 +371,11 @@ def handle_metadata(cfg, hp):
|
|||
params = convert.Params.loadOriginalParamsJson(fakemodel, orig_config_path)
|
||||
else:
|
||||
raise ValueError('Unable to load metadata')
|
||||
vocab = convert.load_vocab(
|
||||
cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir,
|
||||
cfg.vocabtype)
|
||||
# FIXME: Respect cfg.vocab_dir?
|
||||
svocab = gguf.SpecialVocab(cfg.model_metadata_dir,
|
||||
load_merges = cfg.vocabtype == 'bpe',
|
||||
n_vocab = vocab.vocab_size)
|
||||
vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir)
|
||||
vocab_factory = convert.VocabFactory(vocab_path)
|
||||
vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir)
|
||||
convert.check_vocab_size(params, vocab)
|
||||
return (params, vocab, svocab)
|
||||
return params, vocab, special_vocab
|
||||
|
||||
|
||||
def handle_args():
|
||||
|
|
|
@ -5,17 +5,16 @@ import json
|
|||
import os
|
||||
import struct
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, BinaryIO, Sequence
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from pathlib import Path
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
|
||||
import gguf
|
||||
|
||||
|
||||
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
|
||||
|
||||
|
||||
|
@ -60,7 +59,14 @@ if __name__ == '__main__':
|
|||
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
|
||||
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")
|
||||
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
if os.path.exists(input_model):
|
||||
model = torch.load(input_model, map_location="cpu")
|
||||
else:
|
||||
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
|
||||
# lazy import load_file only if lora is in safetensors format.
|
||||
from safetensors.torch import load_file
|
||||
model = load_file(input_model, device="cpu")
|
||||
|
||||
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
|
||||
|
||||
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
|
||||
|
|
|
@ -1,11 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
import torch
|
||||
import os
|
||||
from pprint import pprint
|
||||
import sys
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from pprint import pprint
|
||||
|
||||
import torch
|
||||
from sentencepiece import SentencePieceProcessor
|
||||
|
||||
if 'NO_LOCAL_GGUF' not in os.environ:
|
||||
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
|
||||
import gguf
|
||||
|
@ -69,7 +71,7 @@ def main():
|
|||
persimmon_model = torch.load(args.ckpt_path)
|
||||
hparams = persimmon_model['args']
|
||||
pprint(hparams)
|
||||
tensors = {}
|
||||
tensors: dict[str, torch.Tensor] = {}
|
||||
_flatten_dict(persimmon_model['model'], tensors, None)
|
||||
|
||||
arch = gguf.MODEL_ARCH.PERSIMMON
|
||||
|
|
645
convert.py
645
convert.py
File diff suppressed because it is too large
Load diff
|
@ -23,6 +23,9 @@ else()
|
|||
add_subdirectory(infill)
|
||||
add_subdirectory(llama-bench)
|
||||
add_subdirectory(llava)
|
||||
if (LLAMA_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
endif()
|
||||
add_subdirectory(main)
|
||||
add_subdirectory(tokenize)
|
||||
add_subdirectory(parallel)
|
||||
|
|
|
@ -1800,6 +1800,8 @@ int main(int argc, char ** argv) {
|
|||
std::vector<size_t> train_samples_begin;
|
||||
std::vector<size_t> train_samples_size;
|
||||
printf("%s: tokenize training data from %s\n", __func__, params.common.fn_train_data);
|
||||
printf("%s: sample-start: %s\n", __func__, params.common.sample_start.c_str());
|
||||
printf("%s: include-sample-start: %s\n", __func__, params.common.include_sample_start ? "true" : "false");
|
||||
tokenize_file(lctx,
|
||||
params.common.fn_train_data,
|
||||
params.common.sample_start,
|
||||
|
|
32
examples/imatrix/README.md
Normal file
32
examples/imatrix/README.md
Normal file
|
@ -0,0 +1,32 @@
|
|||
# llama.cpp/examples/imatrix
|
||||
|
||||
Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantum models.
|
||||
More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
|
||||
|
||||
## Usage
|
||||
|
||||
```
|
||||
./imatrix -m <some_fp_model> -f <some_training_data> [-o <output_file>] [--verbosity <verbosity_level>]
|
||||
[-ofreq num_chunks] [-ow <0 or 1>] [other common params]
|
||||
```
|
||||
|
||||
Here `-m` with a model name and `-f` with a file containing training data (such as e.g. `wiki.train.raw`) are mandatory.
|
||||
The parameters in square brackets are optional and have the following meaning:
|
||||
* `-o` (or `--output-file`) specifies the name of the file where the computed data will be stored. If missing `imatrix.dat` is used.
|
||||
* `--verbosity` specifies the verbosity level. If set to `0`, no output other than the perplexity of the processed chunks will be generated. If set to `1`, each time the results are saved a message is written to `stderr`. If `>=2`, a message is output each time data is collected for any tensor. Default verbosity level is `1`.
|
||||
* `-ofreq` (or `--output-frequency`) specifies how often the so far computed result is saved to disk. Default is 10 (i.e., every 10 chunks)
|
||||
* `-ow` (or `--output-weight`) specifies if data will be collected for the `output.weight` tensor. My experience is that it is better to not utilize the importance matrix when quantizing `output.weight`, so this is set to `false` by default.
|
||||
|
||||
For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||
|
||||
## Example
|
||||
|
||||
```bash
|
||||
LLAMA_CUBLAS=1 make -j
|
||||
|
||||
# generate importance matrix (imatrix.dat)
|
||||
./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||
|
||||
# use the imatrix to perform a Q4_K_M quantization
|
||||
./quantize --imatrix imatrix.dat ggml-model-f16.gguf ./ggml-model-q4_k_m.gguf q4_k_m
|
||||
```
|
|
@ -26,6 +26,7 @@ struct StatParams {
|
|||
std::string ofile = "imatrix.dat";
|
||||
int n_output_frequency = 10;
|
||||
int verbosity = 1;
|
||||
int keep_every = 0;
|
||||
bool collect_output_weight = false;
|
||||
};
|
||||
|
||||
|
@ -33,47 +34,144 @@ class IMatrixCollector {
|
|||
public:
|
||||
IMatrixCollector() = default;
|
||||
void set_parameters(StatParams&& params) { m_params = std::move(params); }
|
||||
void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
||||
bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
void save_imatrix() const;
|
||||
private:
|
||||
std::unordered_map<std::string, Stats> m_stats;
|
||||
StatParams m_params;
|
||||
std::mutex m_mutex;
|
||||
int m_last_call = 0;
|
||||
std::vector<float> m_src1_data;
|
||||
std::vector<int> m_ids; // the expert ids from ggml_mul_mat_id
|
||||
//
|
||||
void save_imatrix(const char * file_name) const;
|
||||
void keep_imatrix(int ncall) const;
|
||||
};
|
||||
|
||||
void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return;
|
||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return;
|
||||
bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
GGML_UNUSED(user_data);
|
||||
|
||||
const struct ggml_tensor * src0 = t->src[0];
|
||||
const struct ggml_tensor * src1 = t->src[1];
|
||||
|
||||
// when ask is true, the scheduler wants to know if we are interested in data from this tensor
|
||||
// if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
|
||||
if (ask) {
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
|
||||
if (t->op != GGML_OP_MUL_MAT) return false;
|
||||
if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
|
||||
if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
auto& e = m_stats[src0->name];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
|
||||
// copy the data from the GPU memory if needed
|
||||
const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
|
||||
|
||||
if (!is_host) {
|
||||
m_src1_data.resize(ggml_nelements(src1));
|
||||
ggml_backend_tensor_get(src1, m_src1_data.data(), 0, ggml_nbytes(src1));
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = (const float *)src1->data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
m_last_call = e.ncall;
|
||||
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||
save_imatrix();
|
||||
|
||||
const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
|
||||
|
||||
if (t->op == GGML_OP_MUL_MAT_ID) {
|
||||
const int idx = ((int32_t *) t->op_params)[0];
|
||||
const int n_as = ((int32_t *) t->op_params)[1];
|
||||
|
||||
// the top-k selected expert ids are stored in the src0 tensor
|
||||
// for simplicity, always copy src0 to host, because it is small
|
||||
// take into account that src0 is not contiguous!
|
||||
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
|
||||
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
|
||||
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
|
||||
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
|
||||
|
||||
// loop over all possible experts, regardless if they are used or not in the batch
|
||||
// this is necessary to guarantee equal number of "ncall" for each tensor
|
||||
for (int ex = 0; ex < n_as; ++ex) {
|
||||
src0 = t->src[2 + ex];
|
||||
auto& e = m_stats[src0->name];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
|
||||
// using the following line, we can correct for that if needed
|
||||
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const int excur = m_ids[row*n_as + idx];
|
||||
GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
|
||||
if (excur != ex) continue;
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
m_last_call = e.ncall;
|
||||
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||
save_imatrix();
|
||||
}
|
||||
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
|
||||
keep_imatrix(m_last_call);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
auto& e = m_stats[src0->name];
|
||||
if (e.values.empty()) {
|
||||
e.values.resize(src1->ne[0], 0);
|
||||
}
|
||||
else if (e.values.size() != (size_t)src1->ne[0]) {
|
||||
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]);
|
||||
exit(1); //GGML_ASSERT(false);
|
||||
}
|
||||
++e.ncall;
|
||||
if (m_params.verbosity > 1) {
|
||||
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
|
||||
}
|
||||
for (int row = 0; row < (int)src1->ne[1]; ++row) {
|
||||
const float * x = data + row * src1->ne[0];
|
||||
for (int j = 0; j < (int)src1->ne[0]; ++j) {
|
||||
e.values[j] += x[j]*x[j];
|
||||
}
|
||||
}
|
||||
if (e.ncall > m_last_call) {
|
||||
m_last_call = e.ncall;
|
||||
if (m_last_call % m_params.n_output_frequency == 0) {
|
||||
save_imatrix();
|
||||
}
|
||||
if (m_params.keep_every > 0 && m_last_call%m_params.keep_every == 0) {
|
||||
keep_imatrix(m_last_call);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix() const {
|
||||
const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str();
|
||||
save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::keep_imatrix(int ncall) const {
|
||||
auto file_name = m_params.ofile;
|
||||
if (file_name.empty()) file_name = "imatrix.dat";
|
||||
file_name += ".at_";
|
||||
file_name += std::to_string(ncall);
|
||||
save_imatrix(file_name.c_str());
|
||||
}
|
||||
|
||||
void IMatrixCollector::save_imatrix(const char * fname) const {
|
||||
std::ofstream out(fname, std::ios::binary);
|
||||
int n_entries = m_stats.size();
|
||||
out.write((const char*)&n_entries, sizeof(n_entries));
|
||||
|
@ -93,8 +191,8 @@ void IMatrixCollector::save_imatrix() const {
|
|||
|
||||
static IMatrixCollector g_collector;
|
||||
|
||||
static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) {
|
||||
g_collector.collect_imatrix(src0, src1);
|
||||
static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
|
||||
return g_collector.collect_imatrix(t, ask, user_data);
|
||||
}
|
||||
|
||||
|
||||
|
@ -171,7 +269,7 @@ static void process_logits(
|
|||
}
|
||||
}
|
||||
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
||||
static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl) {
|
||||
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||
const int n_ctx = llama_n_ctx(ctx);
|
||||
|
@ -192,10 +290,12 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||
}
|
||||
|
||||
std::vector<float> logit_history;
|
||||
logit_history.resize(tokens.size());
|
||||
|
||||
std::vector<float> prob_history;
|
||||
prob_history.resize(tokens.size());
|
||||
|
||||
if (compute_ppl) {
|
||||
logit_history.resize(tokens.size());
|
||||
prob_history.resize(tokens.size());
|
||||
}
|
||||
|
||||
const int n_chunk_max = tokens.size() / n_ctx;
|
||||
|
||||
|
@ -211,12 +311,17 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||
|
||||
std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
|
||||
|
||||
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
||||
|
||||
std::vector<float> logits;
|
||||
if (compute_ppl && num_batches > 1) {
|
||||
logits.reserve((size_t)n_ctx * n_vocab);
|
||||
}
|
||||
|
||||
for (int i = 0; i < n_chunk; ++i) {
|
||||
const int start = i * n_ctx;
|
||||
const int end = start + n_ctx;
|
||||
|
||||
const int num_batches = (n_ctx + n_batch - 1) / n_batch;
|
||||
|
||||
std::vector<float> logits;
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
@ -244,8 +349,10 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||
// restore the original token in case it was set to BOS
|
||||
tokens[batch_start] = token_org;
|
||||
|
||||
const auto * batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
if (compute_ppl && num_batches > 1) {
|
||||
const auto * batch_logits = llama_get_logits(ctx);
|
||||
logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
|
||||
}
|
||||
}
|
||||
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
@ -261,25 +368,32 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||
fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0);
|
||||
}
|
||||
|
||||
const int first = n_ctx/2;
|
||||
process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += n_ctx - first - 1;
|
||||
if (compute_ppl) {
|
||||
const int first = n_ctx/2;
|
||||
const auto all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx);
|
||||
process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first,
|
||||
workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first);
|
||||
count += n_ctx - first - 1;
|
||||
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
fflush(stdout);
|
||||
printf("[%d]%.4lf,", i + 1, std::exp(nll / count));
|
||||
fflush(stdout);
|
||||
|
||||
logits.clear();
|
||||
}
|
||||
}
|
||||
printf("\n");
|
||||
|
||||
nll2 /= count;
|
||||
nll /= count;
|
||||
const double ppl = exp(nll);
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
if (compute_ppl) {
|
||||
nll2 /= count;
|
||||
nll /= count;
|
||||
const double ppl = exp(nll);
|
||||
nll2 -= nll * nll;
|
||||
if (nll2 > 0) {
|
||||
nll2 = sqrt(nll2/(count-1));
|
||||
printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
|
||||
} else {
|
||||
printf("Unexpected negative standard deviation of log(prob)\n");
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -288,6 +402,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params) {
|
|||
int main(int argc, char ** argv) {
|
||||
|
||||
StatParams sparams;
|
||||
bool compute_ppl = true;
|
||||
std::vector<char*> args;
|
||||
args.push_back(argv[0]);
|
||||
int iarg = 1;
|
||||
|
@ -304,12 +419,21 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
else if (arg == "--verbosity") {
|
||||
sparams.verbosity = std::stoi(argv[++iarg]);
|
||||
} else if (arg == "--no-ppl") {
|
||||
compute_ppl = false;
|
||||
} else if (arg == "--keep-imatrix") {
|
||||
sparams.keep_every = std::stoi(argv[++iarg]);
|
||||
} else {
|
||||
args.push_back(argv[iarg]);
|
||||
}
|
||||
}
|
||||
if (iarg < argc) {
|
||||
args.push_back(argv[iarg]);
|
||||
std::string arg{argv[iarg]};
|
||||
if (arg == "--no-ppl") {
|
||||
compute_ppl = false;
|
||||
} else {
|
||||
args.push_back(argv[iarg]);
|
||||
}
|
||||
}
|
||||
|
||||
gpt_params params;
|
||||
|
@ -320,8 +444,6 @@ int main(int argc, char ** argv) {
|
|||
|
||||
g_collector.set_parameters(std::move(sparams));
|
||||
|
||||
ggml_set_imatrix_collection(ik_collect_imatrix);
|
||||
|
||||
params.logits_all = true;
|
||||
params.n_batch = std::min(params.n_batch, params.n_ctx);
|
||||
|
||||
|
@ -340,16 +462,27 @@ int main(int argc, char ** argv) {
|
|||
|
||||
llama_backend_init(params.numa);
|
||||
|
||||
llama_model * model;
|
||||
llama_context * ctx;
|
||||
llama_model_params mparams = llama_model_params_from_gpt_params(params);
|
||||
|
||||
// load the model and apply lora adapter, if any
|
||||
std::tie(model, ctx) = llama_init_from_gpt_params(params);
|
||||
llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||
if (model == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to load model\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
llama_context_params cparams = llama_context_params_from_gpt_params(params);
|
||||
|
||||
// pass the callback to the backend scheduler
|
||||
// it will be executed for each node during the graph computation
|
||||
cparams.cb_eval = ik_collect_imatrix;
|
||||
cparams.cb_eval_user_data = NULL;
|
||||
|
||||
llama_context * ctx = llama_new_context_with_model(model, cparams);
|
||||
if (ctx == NULL) {
|
||||
fprintf(stderr, "%s: error: unable to create context\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
const int n_ctx_train = llama_n_ctx_train(model);
|
||||
if (params.n_ctx > n_ctx_train) {
|
||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||
|
@ -362,7 +495,7 @@ int main(int argc, char ** argv) {
|
|||
fprintf(stderr, "%s\n", get_system_info(params).c_str());
|
||||
}
|
||||
|
||||
bool OK = compute_imatrix(ctx, params);
|
||||
bool OK = compute_imatrix(ctx, params, compute_ppl);
|
||||
if (!OK) {
|
||||
return 1;
|
||||
}
|
||||
|
|
|
@ -241,7 +241,7 @@ int main(int argc, char ** argv) {
|
|||
LOG("add_bos: %d\n", add_bos);
|
||||
|
||||
bool suff_rm_leading_spc = params.escape;
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) {
|
||||
if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) {
|
||||
params.input_suffix.erase(0, 1);
|
||||
suff_rm_leading_spc = false;
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ android {
|
|||
}
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
arguments += "-DCMAKE_BUILD_TYPE=Release"
|
||||
cppFlags += listOf()
|
||||
arguments += listOf()
|
||||
}
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
" Similarly, you could add an insert mode keybind with
|
||||
" inoremap <C-B> <Cmd>call llama#doLlamaGen()<CR>
|
||||
"
|
||||
" g:llama_api_url and g:llama_overrides can be configured in your .vimrc
|
||||
" g:llama_api_url, g:llama_api_key and g:llama_overrides can be configured in your .vimrc
|
||||
" let g:llama_api_url = "192.168.1.10:8080"
|
||||
" llama_overrides can also be set through buffer/window scopes. For instance
|
||||
" autocmd filetype python let b:llama_overrides = {"temp": 0.2}
|
||||
|
@ -82,6 +82,9 @@ func llama#doLlamaGen()
|
|||
endif
|
||||
let l:querydata.prompt = join(l:buflines, "\n")
|
||||
let l:curlcommand = copy(s:curlcommand)
|
||||
if exists("g:llama_api_key")
|
||||
call extend(l:curlcommand, ['--header', 'Authorization: Bearer ' .. g:llama_api_key])
|
||||
endif
|
||||
let l:curlcommand[2] = json_encode(l:querydata)
|
||||
let b:job = job_start(l:curlcommand, {"callback": function("s:callbackHandler", [l:cbuffer])})
|
||||
endfunction
|
||||
|
|
131
examples/llava/MobileVLM-README.md
Normal file
131
examples/llava/MobileVLM-README.md
Normal file
|
@ -0,0 +1,131 @@
|
|||
# MobileVLM
|
||||
|
||||
Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
|
||||
|
||||
for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)
|
||||
|
||||
The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
|
||||
|
||||
## Usage
|
||||
Build with cmake or run `make llava-cli` to build it.
|
||||
|
||||
After building, run: `./llava-cli` to see the usage. For example:
|
||||
|
||||
```sh
|
||||
./llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \
|
||||
--mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \
|
||||
--image path/to/an/image.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
|
||||
## Model conversion
|
||||
|
||||
- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally:
|
||||
|
||||
```sh
|
||||
git clone https://huggingface.co/mtgv/MobileVLM-1.7B
|
||||
|
||||
git clone https://huggingface.co/openai/clip-vit-large-patch14-336
|
||||
```
|
||||
|
||||
2. Use `llava-surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
|
||||
|
||||
```sh
|
||||
python ./examples/llava/convert-image-encoder-to-gguf \
|
||||
-m path/to/clip-vit-large-patch14-336 \
|
||||
--llava-projector path/to/MobileVLM-1.7B/llava.projector \
|
||||
--output-dir path/to/MobileVLM-1.7B \
|
||||
--projector-type ldp
|
||||
```
|
||||
|
||||
4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:
|
||||
|
||||
```sh
|
||||
python ./convert.py path/to/MobileVLM-1.7B
|
||||
```
|
||||
|
||||
5. Use `quantize` to convert LLaMA part's DataType from `fp16` to `q4_k`
|
||||
```sh
|
||||
./quantize path/to/MobileVLM-1.7B/ggml-model-f16.gguf path/to/MobileVLM-1.7B/ggml-model-q4_k.gguf q4_k_s
|
||||
```
|
||||
|
||||
Now both the LLaMA part and the image encoder is in the `MobileVLM-1.7B` directory.
|
||||
|
||||
## Android compile and run
|
||||
### compile
|
||||
refer to `examples/llava/android/build_64.sh`
|
||||
```sh
|
||||
mkdir examples/llava/android/build_64
|
||||
cd examples/llava/android/build_64
|
||||
../build_64.sh
|
||||
```
|
||||
### run on Android
|
||||
refer to `android/adb_run.sh`, modify resources' `name` and `path`
|
||||
|
||||
## some result on Android with `Snapdragon 888` chip
|
||||
### case 1
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/demo.jpg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
||||
```
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21148.71 ms by CLIP ( 146.87 ms per image patch)
|
||||
Susan Wise Bauer
|
||||
llama_print_timings: load time = 23574.72 ms
|
||||
llama_print_timings: sample time = 1.24 ms / 6 runs ( 0.21 ms per token, 4850.44 tokens per second)
|
||||
llama_print_timings: prompt eval time = 12460.15 ms / 246 tokens ( 50.65 ms per token, 19.74 tokens per second)
|
||||
llama_print_timings: eval time = 424.86 ms / 6 runs ( 70.81 ms per token, 14.12 tokens per second)
|
||||
llama_print_timings: total time = 34731.93 ms
|
||||
```
|
||||
### case 2
|
||||
**input**
|
||||
```sh
|
||||
/data/local/tmp/llava-cli \
|
||||
-m /data/local/tmp/ggml-model-q4_k.gguf \
|
||||
--mmproj /data/local/tmp/mmproj-model-f16.gguf \
|
||||
-t 4 \
|
||||
--image /data/local/tmp/cat.jpeg \
|
||||
-p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
```
|
||||
|
||||
**output**
|
||||
```sh
|
||||
encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch)
|
||||
The image depicts a cat sitting in the grass near some tall green plants.
|
||||
llama_print_timings: load time = 23257.32 ms
|
||||
llama_print_timings: sample time = 5.25 ms / 18 runs ( 0.29 ms per token, 3430.53 tokens per second)
|
||||
llama_print_timings: prompt eval time = 11900.73 ms / 232 tokens ( 51.30 ms per token, 19.49 tokens per second)
|
||||
llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 ms per token, 14.07 tokens per second)
|
||||
llama_print_timings: total time = 34570.79 ms
|
||||
```
|
||||
|
||||
## Minor shortcomings
|
||||
The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost.
|
||||
|
||||
## TODO
|
||||
|
||||
- [ ] Support non-CPU backend for the new operators, such as `depthwise`, `hardswish`, `hardsigmoid`
|
||||
- [ ] Optimize LDP projector performance
|
||||
|
||||
- Optimize the structure definition to avoid unnecessary memory rearrangements, to reduce the use of `ggml_permute_cpy`;
|
||||
- Optimize operator implementation (ARM CPU/NVIDIA GPU): such as depthwise conv, hardswish, hardsigmoid, etc.
|
||||
- [ ] run MobileVLM on `Jetson Orin`
|
||||
- [ ] Support more model variants, such as `MobileVLM-3B`.
|
||||
|
||||
|
||||
## contributor
|
||||
```sh
|
||||
zhangjidong05, yangyang260, huyiming03, chenxiaotao03
|
||||
```
|
53
examples/llava/android/adb_run.sh
Executable file
53
examples/llava/android/adb_run.sh
Executable file
|
@ -0,0 +1,53 @@
|
|||
#!/bin/bash
|
||||
|
||||
model_dir="/Users/cxt/model/llm/mobileVLM/MobileVLM-1.7B_processed"
|
||||
projector_name="mmproj-model-f16.gguf"
|
||||
llama_name="ggml-model-q4_k.gguf"
|
||||
img_dir="/Users/cxt/model/llm"
|
||||
img_name="demo.jpg"
|
||||
prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWho is the author of this book? \nAnswer the question using a single word or phrase. ASSISTANT:"
|
||||
# img_name="cat.jpeg"
|
||||
# prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\nWhat is in the image? ASSISTANT:"
|
||||
|
||||
program_dir="build_64/bin"
|
||||
binName="llava-cli"
|
||||
n_threads=4
|
||||
|
||||
|
||||
deviceDir="/data/local/tmp"
|
||||
saveDir="output"
|
||||
if [ ! -d ${saveDir} ]; then
|
||||
mkdir ${saveDir}
|
||||
fi
|
||||
|
||||
|
||||
function android_run() {
|
||||
# # copy resource into device
|
||||
# adb push ${model_dir}/${projector_name} ${deviceDir}/${projector_name}
|
||||
# adb push ${model_dir}/${llama_name} ${deviceDir}/${llama_name}
|
||||
adb push ${img_dir}/${img_name} ${deviceDir}/${img_name}
|
||||
# copy program into device
|
||||
adb push ${program_dir}/${binName} ${deviceDir}/${binName}
|
||||
adb shell "chmod 0777 ${deviceDir}/${binName}"
|
||||
|
||||
# run
|
||||
adb shell "echo cd ${deviceDir} ${deviceDir}/${binName} \
|
||||
-m ${deviceDir}/${llama_name} \
|
||||
--mmproj ${deviceDir}/${projector_name} \
|
||||
-t ${n_threads} \
|
||||
--image ${deviceDir}/${img_name} \
|
||||
-p \"${prompt}\" \
|
||||
> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt"
|
||||
adb shell "cd ${deviceDir}; pwd; ${deviceDir}/${binName} \
|
||||
-m ${deviceDir}/${llama_name} \
|
||||
--mmproj ${deviceDir}/${projector_name} \
|
||||
-t ${n_threads} \
|
||||
--image ${deviceDir}/${img_name} \
|
||||
-p \"${prompt}\" \
|
||||
>> ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt 2>&1"
|
||||
adb pull ${deviceDir}/${modelName}_${projector_name}_${n_threads}_${img_name}.txt ${saveDir}
|
||||
}
|
||||
|
||||
android_run
|
||||
|
||||
echo "android_run is Done!"
|
8
examples/llava/android/build_64.sh
Executable file
8
examples/llava/android/build_64.sh
Executable file
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
|
||||
cmake ../../../../ \
|
||||
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DANDROID_ABI="arm64-v8a" \
|
||||
-DANDROID_PLATFORM=android-23 $1
|
||||
|
||||
make -j4
|
|
@ -2,17 +2,6 @@
|
|||
// so there might be still unnecessary artifacts hanging around
|
||||
// I'll gradually clean and extend it
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
|
||||
#include "clip.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
@ -29,6 +18,19 @@
|
|||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <cinttypes>
|
||||
|
||||
static std::string format(const char * fmt, ...) {
|
||||
va_list ap;
|
||||
va_list ap2;
|
||||
|
@ -67,6 +69,7 @@ static std::string format(const char * fmt, ...) {
|
|||
#define KEY_PATCH_SIZE "clip.vision.patch_size"
|
||||
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
|
||||
#define KEY_IMAGE_STD "clip.vision.image_std"
|
||||
#define KEY_PROJ_TYPE "clip.projector_type"
|
||||
|
||||
//
|
||||
// tensor name constants
|
||||
|
@ -89,6 +92,22 @@ static std::string format(const char * fmt, ...) {
|
|||
#define TN_TEXT_PROJ "text_projection.weight"
|
||||
#define TN_VIS_PROJ "visual_projection.weight"
|
||||
#define TN_LLAVA_PROJ "mm.%d.%s"
|
||||
#define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
|
||||
#define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
|
||||
|
||||
|
||||
enum projector_type {
|
||||
PROJECTOR_TYPE_MLP,
|
||||
PROJECTOR_TYPE_MLP_NORM,
|
||||
PROJECTOR_TYPE_LDP,
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_MLP, "mlp" },
|
||||
{ PROJECTOR_TYPE_LDP, "ldp" },
|
||||
};
|
||||
|
||||
|
||||
//
|
||||
// utilities to get data from a gguf file
|
||||
|
@ -129,6 +148,91 @@ static std::string get_ftype(int ftype) {
|
|||
return ggml_type_name(static_cast<ggml_type>(ftype));
|
||||
}
|
||||
|
||||
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
||||
switch (type) {
|
||||
case GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
|
||||
case GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
|
||||
case GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
|
||||
case GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
|
||||
case GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
|
||||
case GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
|
||||
case GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
|
||||
case GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
|
||||
default: return format("unknown type %d", type);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
|
||||
std::string result;
|
||||
for (size_t pos = 0; ; pos += search.length()) {
|
||||
auto new_pos = s.find(search, pos);
|
||||
if (new_pos == std::string::npos) {
|
||||
result += s.substr(pos, s.size() - pos);
|
||||
break;
|
||||
}
|
||||
result += s.substr(pos, new_pos - pos) + replace;
|
||||
pos = new_pos;
|
||||
}
|
||||
s = std::move(result);
|
||||
}
|
||||
|
||||
static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
|
||||
|
||||
switch (type) {
|
||||
case GGUF_TYPE_STRING:
|
||||
return gguf_get_val_str(ctx_gguf, i);
|
||||
case GGUF_TYPE_ARRAY:
|
||||
{
|
||||
const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
|
||||
int arr_n = gguf_get_arr_n(ctx_gguf, i);
|
||||
const void * data = gguf_get_arr_data(ctx_gguf, i);
|
||||
std::stringstream ss;
|
||||
ss << "[";
|
||||
for (int j = 0; j < arr_n; j++) {
|
||||
if (arr_type == GGUF_TYPE_STRING) {
|
||||
std::string val = gguf_get_arr_str(ctx_gguf, i, j);
|
||||
// escape quotes
|
||||
replace_all(val, "\\", "\\\\");
|
||||
replace_all(val, "\"", "\\\"");
|
||||
ss << '"' << val << '"';
|
||||
} else if (arr_type == GGUF_TYPE_ARRAY) {
|
||||
ss << "???";
|
||||
} else {
|
||||
ss << gguf_data_to_str(arr_type, data, j);
|
||||
}
|
||||
if (j < arr_n - 1) {
|
||||
ss << ", ";
|
||||
}
|
||||
}
|
||||
ss << "]";
|
||||
return ss.str();
|
||||
}
|
||||
default:
|
||||
return gguf_data_to_str(type, gguf_get_val_data(ctx_gguf, i), 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void print_tensor_info(const ggml_tensor* tensor, const char* prefix = "") {
|
||||
size_t tensor_size = ggml_nbytes(tensor);
|
||||
printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n",
|
||||
prefix, ggml_n_dims(tensor), tensor->name, tensor_size,
|
||||
tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type));
|
||||
}
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & name) {
|
||||
for (const auto & kv : PROJECTOR_TYPE_NAMES) { // NOLINT
|
||||
if (kv.second == name) {
|
||||
return kv.first;
|
||||
}
|
||||
}
|
||||
return PROJECTOR_TYPE_UNKNOWN;
|
||||
}
|
||||
|
||||
//
|
||||
// image data
|
||||
//
|
||||
|
@ -201,10 +305,44 @@ struct clip_vision_model {
|
|||
struct ggml_tensor * projection;
|
||||
|
||||
// LLaVA projection
|
||||
struct ggml_tensor * mm_0_w;
|
||||
struct ggml_tensor * mm_0_b;
|
||||
struct ggml_tensor * mm_2_w;
|
||||
struct ggml_tensor * mm_2_b;
|
||||
struct ggml_tensor * mm_0_w = NULL;
|
||||
struct ggml_tensor * mm_0_b = NULL;
|
||||
struct ggml_tensor * mm_2_w = NULL;
|
||||
struct ggml_tensor * mm_2_b = NULL;
|
||||
|
||||
// Yi type models with mlp+normalization projection
|
||||
struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
|
||||
struct ggml_tensor * mm_1_b = NULL;
|
||||
struct ggml_tensor * mm_3_w = NULL;
|
||||
struct ggml_tensor * mm_3_b = NULL;
|
||||
struct ggml_tensor * mm_4_w = NULL;
|
||||
struct ggml_tensor * mm_4_b = NULL;
|
||||
|
||||
// MobileVLM projection
|
||||
struct ggml_tensor * mm_model_mlp_1_w;
|
||||
struct ggml_tensor * mm_model_mlp_1_b;
|
||||
struct ggml_tensor * mm_model_mlp_3_w;
|
||||
struct ggml_tensor * mm_model_mlp_3_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_0_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_0_1_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc1_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc2_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_1_fc2_b;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_0_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_1_w;
|
||||
struct ggml_tensor * mm_model_block_1_block_2_1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_0_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_0_1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc1_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc2_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_1_fc2_b;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_0_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_w;
|
||||
struct ggml_tensor * mm_model_block_2_block_2_1_b;
|
||||
};
|
||||
|
||||
struct clip_ctx {
|
||||
|
@ -213,6 +351,7 @@ struct clip_ctx {
|
|||
bool has_llava_projector = false;
|
||||
|
||||
struct clip_vision_model vision_model;
|
||||
projector_type proj_type = PROJECTOR_TYPE_MLP;
|
||||
|
||||
float image_mean[3];
|
||||
float image_std[3];
|
||||
|
@ -330,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
// pre-layernorm
|
||||
{
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
ggml_set_name(embeddings, "pre_ln");
|
||||
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
|
||||
}
|
||||
|
@ -430,16 +570,156 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
|
|||
free(patches_data);
|
||||
}
|
||||
|
||||
// shape [1, 576, 1024]
|
||||
// ne is whcn, ne = [1024, 576, 1, 1]
|
||||
embeddings = ggml_get_rows(ctx0, embeddings, patches);
|
||||
|
||||
// mm projection 0
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// print_tensor_info(embeddings, "embeddings");
|
||||
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
// llava projector
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
|
||||
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
|
||||
// ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
|
||||
// First LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
|
||||
model.mm_1_b);
|
||||
|
||||
// GELU activation
|
||||
embeddings = ggml_gelu(ctx0, embeddings);
|
||||
|
||||
// Second linear layer
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
|
||||
embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
|
||||
|
||||
// Second LayerNorm
|
||||
embeddings = ggml_norm(ctx0, embeddings, eps);
|
||||
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
|
||||
model.mm_4_b);
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projector
|
||||
int n_patch = 24;
|
||||
struct ggml_tensor * mlp_1 = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, embeddings);
|
||||
mlp_1 = ggml_add(ctx0, mlp_1, model.mm_model_mlp_1_b);
|
||||
mlp_1 = ggml_gelu(ctx0, mlp_1);
|
||||
struct ggml_tensor * mlp_3 = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, mlp_1);
|
||||
mlp_3 = ggml_add(ctx0, mlp_3, model.mm_model_mlp_3_b);
|
||||
// mlp_3 shape = [1, 576, 2048], ne = [2048, 576, 1, 1]
|
||||
|
||||
// block 1
|
||||
struct ggml_tensor * block_1 = nullptr;
|
||||
{
|
||||
// transpose from [1, 576, 2048] --> [1, 2048, 576] --> [1, 2048, 24, 24]
|
||||
mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3));
|
||||
mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]);
|
||||
// stride = 1, padding = 1, bias is nullptr
|
||||
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1);
|
||||
|
||||
// layer norm
|
||||
// // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_0_1_w), model.mm_model_block_1_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
|
||||
// block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// hardswish
|
||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_1_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
// block_1_hw shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1], block_1 shape = [1, 2048], ne = [2048, 1, 1, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_1_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
// block_1 shape = [1, 24, 24, 2048], ne = [2048, 24, 24, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_1_block_2_1_w), model.mm_model_block_1_block_2_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1]
|
||||
// residual
|
||||
block_1 = ggml_add(ctx0, mlp_3, block_1);
|
||||
}
|
||||
|
||||
// block_2
|
||||
{
|
||||
// stride = 2
|
||||
block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1);
|
||||
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// layer norm
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 2, 0, 3));
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_0_1_w), model.mm_model_block_2_block_0_1_b);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 2, 0, 1, 3));
|
||||
// block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1]
|
||||
// hardswish
|
||||
struct ggml_tensor * block_1_hw = ggml_hardswish(ctx0, block_1);
|
||||
|
||||
// not sure the parameters is right for globalAvgPooling
|
||||
block_1 = ggml_pool_2d(ctx0, block_1_hw, GGML_OP_POOL_AVG, block_1_hw->ne[0], block_1_hw->ne[1], block_1_hw->ne[0], block_1_hw->ne[1], 0, 0);
|
||||
// block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
// pointwise conv
|
||||
block_1 = ggml_reshape_2d(ctx0, block_1, block_1->ne[0]*block_1->ne[1]*block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc1_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc1_b);
|
||||
block_1 = ggml_relu(ctx0, block_1);
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_1_fc2_w, block_1);
|
||||
block_1 = ggml_add(ctx0, block_1, model.mm_model_block_2_block_1_fc2_b);
|
||||
block_1 = ggml_hardsigmoid(ctx0, block_1);
|
||||
|
||||
// block_1_hw shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1], block_1 shape = [1, 2048, 1, 1], ne = [1, 1, 2048, 1]
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, 1, 1, block_1->ne[0], block_1->ne[1]);
|
||||
block_1 = ggml_mul(ctx0, block_1_hw, block_1);
|
||||
|
||||
int w = block_1->ne[0], h = block_1->ne[1];
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, w*h, block_1->ne[2], block_1->ne[3]);
|
||||
block_1 = ggml_cont(ctx0, ggml_permute(ctx0, block_1, 1, 0, 2, 3));
|
||||
// block_1 shape = [1, 24*24, 2048], ne = [24*24, 2048, 1]
|
||||
block_1 = ggml_mul_mat(ctx0, model.mm_model_block_2_block_2_0_w, block_1);
|
||||
block_1 = ggml_reshape_4d(ctx0, block_1, block_1->ne[0], w, h, block_1->ne[3]);
|
||||
|
||||
|
||||
// block_1 shape = [1, 12, 12, 2048], ne = [2048, 12, 12, 1]
|
||||
block_1 = ggml_norm(ctx0, block_1, eps);
|
||||
block_1 = ggml_add(ctx0, ggml_mul(ctx0, block_1, model.mm_model_block_2_block_2_1_w), model.mm_model_block_2_block_2_1_b);
|
||||
block_1 = ggml_reshape_3d(ctx0, block_1, block_1->ne[0], block_1->ne[1] * block_1->ne[2], block_1->ne[3]);
|
||||
// block_1 shape = [1, 144, 2048], ne = [2048, 144, 1]
|
||||
}
|
||||
embeddings = block_1;
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
// build the graph
|
||||
|
@ -485,16 +765,47 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
printf("\n");
|
||||
}
|
||||
const int n_tensors = gguf_get_n_tensors(ctx);
|
||||
|
||||
// kv
|
||||
if (verbosity >= 3) {
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
const int n_kv = gguf_get_n_kv(ctx);
|
||||
printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n",
|
||||
__func__, n_kv, n_tensors, fname);
|
||||
{
|
||||
std::map<enum ggml_type, uint32_t> n_type;
|
||||
|
||||
for (int i = 0; i < n_kv; ++i) {
|
||||
const char * key = gguf_get_key(ctx, i);
|
||||
for (int i = 0; i < n_tensors; i++) {
|
||||
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
||||
|
||||
printf("%s: kv[%d]: key = %s\n", __func__, i, key);
|
||||
n_type[type]++;
|
||||
}
|
||||
|
||||
printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
|
||||
for (int i = 0; i < n_kv; i++) {
|
||||
const char * name = gguf_get_key(ctx, i);
|
||||
const enum gguf_type type = gguf_get_kv_type(ctx, i);
|
||||
const std::string type_name =
|
||||
type == GGUF_TYPE_ARRAY
|
||||
? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx, i)), gguf_get_arr_n(ctx, i))
|
||||
: gguf_type_name(type);
|
||||
|
||||
std::string value = gguf_kv_to_str(ctx, i);
|
||||
const size_t MAX_VALUE_LEN = 40;
|
||||
if (value.size() > MAX_VALUE_LEN) {
|
||||
value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str());
|
||||
}
|
||||
replace_all(value, "\n", "\\n");
|
||||
|
||||
printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str());
|
||||
}
|
||||
|
||||
// print type counts
|
||||
for (auto & kv : n_type) {
|
||||
if (kv.second == 0) {
|
||||
continue;
|
||||
}
|
||||
|
||||
printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second);
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// data
|
||||
|
@ -503,12 +814,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
for (int i = 0; i < n_tensors; ++i) {
|
||||
const char * name = gguf_get_tensor_name(ctx, i);
|
||||
const size_t offset = gguf_get_tensor_offset(ctx, i);
|
||||
enum ggml_type type = gguf_get_tensor_type(ctx, i);
|
||||
struct ggml_tensor * cur = ggml_get_tensor(meta, name);
|
||||
size_t tensor_size = ggml_nbytes(cur);
|
||||
buffer_size += tensor_size;
|
||||
if (verbosity >= 3) {
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu\n", __func__, i,
|
||||
ggml_n_dims(cur), cur->name, tensor_size, offset);
|
||||
printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n",
|
||||
__func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -517,6 +829,23 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
|
||||
clip_ctx * new_clip = new clip_ctx;
|
||||
|
||||
// update projector type
|
||||
{
|
||||
int idx = gguf_find_key(ctx, KEY_PROJ_TYPE);
|
||||
if (idx != -1) {
|
||||
const std::string proj_type = gguf_get_val_str(ctx, idx);
|
||||
new_clip->proj_type = clip_projector_type_from_string(proj_type);
|
||||
}
|
||||
else {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP;
|
||||
}
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
|
||||
new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
new_clip->backend = ggml_backend_cuda_init(0);
|
||||
printf("%s: CLIP using CUDA backend\n", __func__);
|
||||
|
@ -661,10 +990,63 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
|
|||
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
|
||||
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
|
||||
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
|
||||
// LLaVA projection
|
||||
if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
|
||||
vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
|
||||
vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// missing in Yi-type llava
|
||||
vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
|
||||
vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
|
||||
vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
try {
|
||||
// Yi-type llava
|
||||
vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
|
||||
vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
|
||||
} catch (std::runtime_error & e) { }
|
||||
}
|
||||
else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
// MobileVLM projection
|
||||
vision_model.mm_model_mlp_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "weight"));
|
||||
vision_model.mm_model_mlp_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 1, "bias"));
|
||||
vision_model.mm_model_mlp_3_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "weight"));
|
||||
vision_model.mm_model_mlp_3_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 3, "bias"));
|
||||
vision_model.mm_model_block_1_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 0, "1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_1_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_1_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "0.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.weight"));
|
||||
vision_model.mm_model_block_1_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 1, 2, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_0_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_0_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 0, "1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc1.bias"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.weight"));
|
||||
vision_model.mm_model_block_2_block_1_fc2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 1, "fc2.bias"));
|
||||
vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
|
||||
vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
|
||||
vision_model.layers.resize(hparams.n_layer);
|
||||
for (int il = 0; il < hparams.n_layer; ++il) {
|
||||
|
@ -949,7 +1331,6 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
".*weight",
|
||||
};
|
||||
|
||||
std::vector<uint8_t> read_data(512);
|
||||
std::vector<uint8_t> work(512);
|
||||
std::vector<float> conv_buf(512);
|
||||
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||
|
@ -1100,13 +1481,27 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i
|
|||
}
|
||||
|
||||
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
|
||||
}
|
||||
else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
|
||||
return ctx->vision_model.mm_2_b->ne[0];
|
||||
} else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
|
||||
return ctx->vision_model.mm_3_b->ne[0];
|
||||
}
|
||||
else {
|
||||
std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
|
||||
throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
|
||||
}
|
||||
}
|
||||
|
||||
int clip_n_patches(const struct clip_ctx * ctx) {
|
||||
auto & params = ctx->vision_model.hparams;
|
||||
|
||||
return (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
|
||||
if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
|
||||
n_patches /= 4;
|
||||
}
|
||||
return n_patches;
|
||||
}
|
||||
|
||||
size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
|
||||
|
|
|
@ -81,6 +81,7 @@ ap.add_argument("--vision-only", action="store_true", required=False,
|
|||
ap.add_argument("--clip_model_is_vision", action="store_true", required=False,
|
||||
help="The clip model is a pure vision model (ShareGPT4V vision extract for example)")
|
||||
ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
|
||||
ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
|
||||
ap.add_argument("--image-mean", nargs=3, type=float, required=False, help="Override image mean values")
|
||||
ap.add_argument("--image-std", nargs=3, type=float, required=False, help="Override image std values")
|
||||
ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
|
||||
|
@ -174,6 +175,8 @@ elif args.vision_only and not has_llava_projector:
|
|||
fout.add_description("vision-only CLIP model")
|
||||
elif has_llava_projector:
|
||||
fout.add_description("image encoder for LLaVA")
|
||||
# add projector type
|
||||
fout.add_string("clip.projector_type", args.projector_type)
|
||||
else:
|
||||
fout.add_description("two-tower CLIP model")
|
||||
|
||||
|
@ -218,7 +221,8 @@ if has_llava_projector:
|
|||
projector = torch.load(args.llava_projector)
|
||||
for name, data in projector.items():
|
||||
name = get_tensor_name(name)
|
||||
if data.ndim == 2:
|
||||
# pw and dw conv ndim==4
|
||||
if data.ndim == 2 or data.ndim == 4:
|
||||
data = data.squeeze().numpy().astype(np.float16)
|
||||
else:
|
||||
data = data.squeeze().numpy().astype(np.float32)
|
||||
|
|
|
@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
|
||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
|
||||
|
||||
// llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
|
||||
eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
|
||||
std::string system_prompt, user_prompt;
|
||||
size_t image_pos = prompt.find("<image>");
|
||||
if (image_pos != std::string::npos) {
|
||||
// new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
|
||||
|
||||
system_prompt = prompt.substr(0, image_pos);
|
||||
user_prompt = prompt.substr(image_pos + std::string("<image>").length());
|
||||
// We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
|
||||
size_t pos = 0;
|
||||
while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
|
||||
user_prompt.replace(pos, 2, "\n");
|
||||
pos += 1; // Advance past the replaced newline
|
||||
}
|
||||
while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
|
||||
system_prompt.replace(pos, 2, "\n");
|
||||
pos += 1; // Advance past the replaced newline
|
||||
}
|
||||
|
||||
printf("system_prompt: %s\n", system_prompt.c_str());
|
||||
printf("user_prompt: %s\n", user_prompt.c_str());
|
||||
} else {
|
||||
// llava-1.5 native mode
|
||||
system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
|
||||
user_prompt = prompt + "\nASSISTANT:";
|
||||
}
|
||||
|
||||
eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
|
||||
llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
|
||||
eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
|
||||
eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
|
||||
|
||||
// generate the response
|
||||
|
||||
|
@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
|
|||
for (int i = 0; i < max_tgt_len; i++) {
|
||||
const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
|
||||
if (strcmp(tmp, "</s>") == 0) break;
|
||||
if (strstr(tmp, "###")) break; // Yi-VL behavior
|
||||
|
||||
printf("%s", tmp);
|
||||
fflush(stdout);
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,14 +1,14 @@
|
|||
# Function calling example using pydantic models.
|
||||
import datetime
|
||||
import importlib
|
||||
import json
|
||||
from enum import Enum
|
||||
from typing import Union, Optional
|
||||
from typing import Optional, Union
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
import importlib
|
||||
from pydantic_models_to_grammar import generate_gbnf_grammar_and_documentation, convert_dictionary_to_pydantic_model, add_run_method_to_dynamic_model, create_dynamic_model_from_function
|
||||
from pydantic_models_to_grammar import (add_run_method_to_dynamic_model, convert_dictionary_to_pydantic_model,
|
||||
create_dynamic_model_from_function, generate_gbnf_grammar_and_documentation)
|
||||
|
||||
|
||||
# Function to get completion on the llama.cpp server with grammar.
|
||||
|
@ -35,7 +35,7 @@ class SendMessageToUser(BaseModel):
|
|||
print(self.message)
|
||||
|
||||
|
||||
# Enum for the calculator function.
|
||||
# Enum for the calculator tool.
|
||||
class MathOperation(Enum):
|
||||
ADD = "add"
|
||||
SUBTRACT = "subtract"
|
||||
|
@ -43,7 +43,7 @@ class MathOperation(Enum):
|
|||
DIVIDE = "divide"
|
||||
|
||||
|
||||
# Very simple calculator tool for the agent.
|
||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||
class Calculator(BaseModel):
|
||||
"""
|
||||
Perform a math operation on two numbers.
|
||||
|
@ -148,37 +148,6 @@ def get_current_datetime(output_format: Optional[str] = None):
|
|||
return datetime.datetime.now().strftime(output_format)
|
||||
|
||||
|
||||
# Enum for the calculator tool.
|
||||
class MathOperation(Enum):
|
||||
ADD = "add"
|
||||
SUBTRACT = "subtract"
|
||||
MULTIPLY = "multiply"
|
||||
DIVIDE = "divide"
|
||||
|
||||
|
||||
|
||||
# Simple pydantic calculator tool for the agent that can add, subtract, multiply, and divide. Docstring and description of fields will be used in system prompt.
|
||||
class Calculator(BaseModel):
|
||||
"""
|
||||
Perform a math operation on two numbers.
|
||||
"""
|
||||
number_one: Union[int, float] = Field(..., description="First number.")
|
||||
operation: MathOperation = Field(..., description="Math operation to perform.")
|
||||
number_two: Union[int, float] = Field(..., description="Second number.")
|
||||
|
||||
def run(self):
|
||||
if self.operation == MathOperation.ADD:
|
||||
return self.number_one + self.number_two
|
||||
elif self.operation == MathOperation.SUBTRACT:
|
||||
return self.number_one - self.number_two
|
||||
elif self.operation == MathOperation.MULTIPLY:
|
||||
return self.number_one * self.number_two
|
||||
elif self.operation == MathOperation.DIVIDE:
|
||||
return self.number_one / self.number_two
|
||||
else:
|
||||
raise ValueError("Unknown operation.")
|
||||
|
||||
|
||||
# Example function to get the weather
|
||||
def get_current_weather(location, unit):
|
||||
"""Get the current weather in a given location"""
|
||||
|
|
|
@ -1,15 +1,21 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import inspect
|
||||
import json
|
||||
import re
|
||||
from copy import copy
|
||||
from inspect import isclass, getdoc
|
||||
from types import NoneType
|
||||
from enum import Enum
|
||||
from inspect import getdoc, isclass
|
||||
from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union, get_args, get_origin, get_type_hints
|
||||
|
||||
from docstring_parser import parse
|
||||
from pydantic import BaseModel, create_model, Field
|
||||
from typing import Any, Type, List, get_args, get_origin, Tuple, Union, Optional, _GenericAlias
|
||||
from enum import Enum
|
||||
from typing import get_type_hints, Callable
|
||||
import re
|
||||
from pydantic import BaseModel, Field, create_model
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from types import GenericAlias
|
||||
else:
|
||||
# python 3.8 compat
|
||||
from typing import _GenericAlias as GenericAlias
|
||||
|
||||
|
||||
class PydanticDataType(Enum):
|
||||
|
@ -43,7 +49,7 @@ class PydanticDataType(Enum):
|
|||
SET = "set"
|
||||
|
||||
|
||||
def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
||||
def map_pydantic_type_to_gbnf(pydantic_type: type[Any]) -> str:
|
||||
if isclass(pydantic_type) and issubclass(pydantic_type, str):
|
||||
return PydanticDataType.STRING.value
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, bool):
|
||||
|
@ -57,22 +63,22 @@ def map_pydantic_type_to_gbnf(pydantic_type: Type[Any]) -> str:
|
|||
|
||||
elif isclass(pydantic_type) and issubclass(pydantic_type, BaseModel):
|
||||
return format_model_and_field_name(pydantic_type.__name__)
|
||||
elif get_origin(pydantic_type) == list:
|
||||
elif get_origin(pydantic_type) is list:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-list"
|
||||
elif get_origin(pydantic_type) == set:
|
||||
elif get_origin(pydantic_type) is set:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"{map_pydantic_type_to_gbnf(element_type)}-set"
|
||||
elif get_origin(pydantic_type) == Union:
|
||||
elif get_origin(pydantic_type) is Union:
|
||||
union_types = get_args(pydantic_type)
|
||||
union_rules = [map_pydantic_type_to_gbnf(ut) for ut in union_types]
|
||||
return f"union-{'-or-'.join(union_rules)}"
|
||||
elif get_origin(pydantic_type) == Optional:
|
||||
elif get_origin(pydantic_type) is Optional:
|
||||
element_type = get_args(pydantic_type)[0]
|
||||
return f"optional-{map_pydantic_type_to_gbnf(element_type)}"
|
||||
elif isclass(pydantic_type):
|
||||
return f"{PydanticDataType.CUSTOM_CLASS.value}-{format_model_and_field_name(pydantic_type.__name__)}"
|
||||
elif get_origin(pydantic_type) == dict:
|
||||
elif get_origin(pydantic_type) is dict:
|
||||
key_type, value_type = get_args(pydantic_type)
|
||||
return f"custom-dict-key-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(key_type))}-value-type-{format_model_and_field_name(map_pydantic_type_to_gbnf(value_type))}"
|
||||
else:
|
||||
|
@ -106,7 +112,6 @@ def get_members_structure(cls, rule_name):
|
|||
return f"{cls.__name__.lower()} ::= " + " | ".join(members)
|
||||
if cls.__annotations__ and cls.__annotations__ != {}:
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
type_list_rules = []
|
||||
# Modify this comprehension
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param_type)}'
|
||||
|
@ -116,27 +121,25 @@ def get_members_structure(cls, rule_name):
|
|||
|
||||
result += '"," '.join(members)
|
||||
result += ' "}"'
|
||||
return result, type_list_rules
|
||||
elif rule_name == "custom-class-any":
|
||||
return result
|
||||
if rule_name == "custom-class-any":
|
||||
result = f"{rule_name} ::= "
|
||||
result += "value"
|
||||
type_list_rules = []
|
||||
return result, type_list_rules
|
||||
else:
|
||||
init_signature = inspect.signature(cls.__init__)
|
||||
parameters = init_signature.parameters
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
type_list_rules = []
|
||||
# Modify this comprehension too
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
||||
for name, param in parameters.items()
|
||||
if name != "self" and param.annotation != inspect.Parameter.empty
|
||||
]
|
||||
return result
|
||||
|
||||
result += '", "'.join(members)
|
||||
result += ' "}"'
|
||||
return result, type_list_rules
|
||||
init_signature = inspect.signature(cls.__init__)
|
||||
parameters = init_signature.parameters
|
||||
result = f'{rule_name} ::= "{{"'
|
||||
# Modify this comprehension too
|
||||
members = [
|
||||
f' "\\"{name}\\"" ":" {map_pydantic_type_to_gbnf(param.annotation)}'
|
||||
for name, param in parameters.items()
|
||||
if name != "self" and param.annotation != inspect.Parameter.empty
|
||||
]
|
||||
|
||||
result += '", "'.join(members)
|
||||
result += ' "}"'
|
||||
return result
|
||||
|
||||
|
||||
def regex_to_gbnf(regex_pattern: str) -> str:
|
||||
|
@ -269,7 +272,7 @@ def generate_gbnf_float_rules(max_digit=None, min_digit=None, max_precision=None
|
|||
|
||||
def generate_gbnf_rule_for_type(
|
||||
model_name, field_name, field_type, is_optional, processed_models, created_rules, field_info=None
|
||||
) -> Tuple[str, list]:
|
||||
) -> tuple[str, list[str]]:
|
||||
"""
|
||||
Generate GBNF rule for a given field type.
|
||||
|
||||
|
@ -283,7 +286,7 @@ def generate_gbnf_rule_for_type(
|
|||
:param field_info: Additional information about the field (optional).
|
||||
|
||||
:return: Tuple containing the GBNF type and a list of additional rules.
|
||||
:rtype: Tuple[str, list]
|
||||
:rtype: tuple[str, list]
|
||||
"""
|
||||
rules = []
|
||||
|
||||
|
@ -321,8 +324,7 @@ def generate_gbnf_rule_for_type(
|
|||
gbnf_type, rules = model_name + "-" + field_name, rules
|
||||
|
||||
elif gbnf_type.startswith("custom-class-"):
|
||||
nested_model_rules, field_types = get_members_structure(field_type, gbnf_type)
|
||||
rules.append(nested_model_rules)
|
||||
rules.append(get_members_structure(field_type, gbnf_type))
|
||||
elif gbnf_type.startswith("custom-dict-"):
|
||||
key_type, value_type = get_args(field_type)
|
||||
|
||||
|
@ -341,14 +343,14 @@ def generate_gbnf_rule_for_type(
|
|||
union_rules = []
|
||||
|
||||
for union_type in union_types:
|
||||
if isinstance(union_type, _GenericAlias):
|
||||
if isinstance(union_type, GenericAlias):
|
||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||
model_name, field_name, union_type, False, processed_models, created_rules
|
||||
)
|
||||
union_rules.append(union_gbnf_type)
|
||||
rules.extend(union_rules_list)
|
||||
|
||||
elif not issubclass(union_type, NoneType):
|
||||
elif not issubclass(union_type, type(None)):
|
||||
union_gbnf_type, union_rules_list = generate_gbnf_rule_for_type(
|
||||
model_name, field_name, union_type, False, processed_models, created_rules
|
||||
)
|
||||
|
@ -424,14 +426,10 @@ def generate_gbnf_rule_for_type(
|
|||
else:
|
||||
gbnf_type, rules = gbnf_type, []
|
||||
|
||||
if gbnf_type not in created_rules:
|
||||
return gbnf_type, rules
|
||||
else:
|
||||
if gbnf_type in created_rules:
|
||||
return gbnf_type, rules
|
||||
return gbnf_type, rules
|
||||
|
||||
|
||||
def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created_rules: dict) -> (list, bool, bool):
|
||||
def generate_gbnf_grammar(model: type[BaseModel], processed_models: set[type[BaseModel]], created_rules: dict[str, list[str]]) -> tuple[list[str], bool]:
|
||||
"""
|
||||
|
||||
Generate GBnF Grammar
|
||||
|
@ -452,7 +450,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
|||
```
|
||||
"""
|
||||
if model in processed_models:
|
||||
return []
|
||||
return [], False
|
||||
|
||||
processed_models.add(model)
|
||||
model_name = format_model_and_field_name(model.__name__)
|
||||
|
@ -518,7 +516,7 @@ def generate_gbnf_grammar(model: Type[BaseModel], processed_models: set, created
|
|||
|
||||
|
||||
def generate_gbnf_grammar_from_pydantic_models(
|
||||
models: List[Type[BaseModel]], outer_object_name: str = None, outer_object_content: str = None,
|
||||
models: list[type[BaseModel]], outer_object_name: str | None = None, outer_object_content: str | None = None,
|
||||
list_of_outputs: bool = False
|
||||
) -> str:
|
||||
"""
|
||||
|
@ -528,7 +526,7 @@ def generate_gbnf_grammar_from_pydantic_models(
|
|||
* grammar.
|
||||
|
||||
Args:
|
||||
models (List[Type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
||||
models (list[type[BaseModel]]): A list of Pydantic models to generate the grammar from.
|
||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||
list_of_outputs (str, optional): Allows a list of output objects
|
||||
|
@ -543,9 +541,9 @@ def generate_gbnf_grammar_from_pydantic_models(
|
|||
# root ::= UserModel | PostModel
|
||||
# ...
|
||||
"""
|
||||
processed_models = set()
|
||||
processed_models: set[type[BaseModel]] = set()
|
||||
all_rules = []
|
||||
created_rules = {}
|
||||
created_rules: dict[str, list[str]] = {}
|
||||
if outer_object_name is None:
|
||||
for model in models:
|
||||
model_rules, _ = generate_gbnf_grammar(model, processed_models, created_rules)
|
||||
|
@ -608,7 +606,7 @@ def get_primitive_grammar(grammar):
|
|||
Returns:
|
||||
str: GBNF primitive grammar string.
|
||||
"""
|
||||
type_list = []
|
||||
type_list: list[type[object]] = []
|
||||
if "string-list" in grammar:
|
||||
type_list.append(str)
|
||||
if "boolean-list" in grammar:
|
||||
|
@ -666,14 +664,14 @@ triple-quotes ::= "'''" """
|
|||
|
||||
|
||||
def generate_markdown_documentation(
|
||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
Generate markdown documentation for a list of Pydantic models.
|
||||
|
||||
Args:
|
||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
||||
pydantic_models (list[type[BaseModel]]): list of Pydantic model classes.
|
||||
model_prefix (str): Prefix for the model section.
|
||||
fields_prefix (str): Prefix for the fields section.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
@ -731,7 +729,7 @@ def generate_markdown_documentation(
|
|||
|
||||
|
||||
def generate_field_markdown(
|
||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
||||
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
|
@ -739,8 +737,8 @@ def generate_field_markdown(
|
|||
|
||||
Args:
|
||||
field_name (str): Name of the field.
|
||||
field_type (Type[Any]): Type of the field.
|
||||
model (Type[BaseModel]): Pydantic model class.
|
||||
field_type (type[Any]): Type of the field.
|
||||
model (type[BaseModel]): Pydantic model class.
|
||||
depth (int): Indentation depth in the documentation.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
||||
|
@ -798,7 +796,7 @@ def generate_field_markdown(
|
|||
return field_text
|
||||
|
||||
|
||||
def format_json_example(example: dict, depth: int) -> str:
|
||||
def format_json_example(example: dict[str, Any], depth: int) -> str:
|
||||
"""
|
||||
Format a JSON example into a readable string with indentation.
|
||||
|
||||
|
@ -819,14 +817,14 @@ def format_json_example(example: dict, depth: int) -> str:
|
|||
|
||||
|
||||
def generate_text_documentation(
|
||||
pydantic_models: List[Type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
pydantic_models: list[type[BaseModel]], model_prefix="Model", fields_prefix="Fields",
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
Generate text documentation for a list of Pydantic models.
|
||||
|
||||
Args:
|
||||
pydantic_models (List[Type[BaseModel]]): List of Pydantic model classes.
|
||||
pydantic_models (list[type[BaseModel]]): List of Pydantic model classes.
|
||||
model_prefix (str): Prefix for the model section.
|
||||
fields_prefix (str): Prefix for the fields section.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
@ -885,7 +883,7 @@ def generate_text_documentation(
|
|||
|
||||
|
||||
def generate_field_text(
|
||||
field_name: str, field_type: Type[Any], model: Type[BaseModel], depth=1,
|
||||
field_name: str, field_type: type[Any], model: type[BaseModel], depth=1,
|
||||
documentation_with_field_description=True
|
||||
) -> str:
|
||||
"""
|
||||
|
@ -893,8 +891,8 @@ def generate_field_text(
|
|||
|
||||
Args:
|
||||
field_name (str): Name of the field.
|
||||
field_type (Type[Any]): Type of the field.
|
||||
model (Type[BaseModel]): Pydantic model class.
|
||||
field_type (type[Any]): Type of the field.
|
||||
model (type[BaseModel]): Pydantic model class.
|
||||
depth (int): Indentation depth in the documentation.
|
||||
documentation_with_field_description (bool): Include field descriptions in the documentation.
|
||||
|
||||
|
@ -1017,8 +1015,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
|||
pydantic_model_list,
|
||||
grammar_file_path="./generated_grammar.gbnf",
|
||||
documentation_file_path="./generated_grammar_documentation.md",
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
|
@ -1053,8 +1051,8 @@ def generate_and_save_gbnf_grammar_and_documentation(
|
|||
|
||||
def generate_gbnf_grammar_and_documentation(
|
||||
pydantic_model_list,
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
|
@ -1086,9 +1084,9 @@ def generate_gbnf_grammar_and_documentation(
|
|||
|
||||
|
||||
def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
||||
dictionaries: List[dict],
|
||||
outer_object_name: str = None,
|
||||
outer_object_content: str = None,
|
||||
dictionaries: list[dict[str, Any]],
|
||||
outer_object_name: str | None = None,
|
||||
outer_object_content: str | None = None,
|
||||
model_prefix: str = "Output Model",
|
||||
fields_prefix: str = "Output Fields",
|
||||
list_of_outputs: bool = False,
|
||||
|
@ -1098,7 +1096,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
|||
Generate GBNF grammar and documentation from a list of dictionaries.
|
||||
|
||||
Args:
|
||||
dictionaries (List[dict]): List of dictionaries representing Pydantic models.
|
||||
dictionaries (list[dict]): List of dictionaries representing Pydantic models.
|
||||
outer_object_name (str): Outer object name for the GBNF grammar. If None, no outer object will be generated. Eg. "function" for function calling.
|
||||
outer_object_content (str): Content for the outer rule in the GBNF grammar. Eg. "function_parameters" or "params" for function calling.
|
||||
model_prefix (str): Prefix for the model section in the documentation.
|
||||
|
@ -1120,7 +1118,7 @@ def generate_gbnf_grammar_and_documentation_from_dictionaries(
|
|||
return grammar, documentation
|
||||
|
||||
|
||||
def create_dynamic_model_from_function(func: Callable):
|
||||
def create_dynamic_model_from_function(func: Callable[..., Any]):
|
||||
"""
|
||||
Creates a dynamic Pydantic model from a given function's type hints and adds the function as a 'run' method.
|
||||
|
||||
|
@ -1135,6 +1133,7 @@ def create_dynamic_model_from_function(func: Callable):
|
|||
sig = inspect.signature(func)
|
||||
|
||||
# Parse the docstring
|
||||
assert func.__doc__ is not None
|
||||
docstring = parse(func.__doc__)
|
||||
|
||||
dynamic_fields = {}
|
||||
|
@ -1157,7 +1156,6 @@ def create_dynamic_model_from_function(func: Callable):
|
|||
f"Parameter '{param.name}' in function '{func.__name__}' lacks a description in the docstring")
|
||||
|
||||
# Add parameter details to the schema
|
||||
param_doc = next((d for d in docstring.params if d.arg_name == param.name), None)
|
||||
param_docs.append((param.name, param_doc))
|
||||
if param.default == inspect.Parameter.empty:
|
||||
default_value = ...
|
||||
|
@ -1166,10 +1164,10 @@ def create_dynamic_model_from_function(func: Callable):
|
|||
dynamic_fields[param.name] = (
|
||||
param.annotation if param.annotation != inspect.Parameter.empty else str, default_value)
|
||||
# Creating the dynamic model
|
||||
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields)
|
||||
dynamic_model = create_model(f"{func.__name__}", **dynamic_fields) # type: ignore[call-overload]
|
||||
|
||||
for param_doc in param_docs:
|
||||
dynamic_model.model_fields[param_doc[0]].description = param_doc[1].description
|
||||
for name, param_doc in param_docs:
|
||||
dynamic_model.model_fields[name].description = param_doc.description
|
||||
|
||||
dynamic_model.__doc__ = docstring.short_description
|
||||
|
||||
|
@ -1182,16 +1180,16 @@ def create_dynamic_model_from_function(func: Callable):
|
|||
return dynamic_model
|
||||
|
||||
|
||||
def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
||||
def add_run_method_to_dynamic_model(model: type[BaseModel], func: Callable[..., Any]):
|
||||
"""
|
||||
Add a 'run' method to a dynamic Pydantic model, using the provided function.
|
||||
|
||||
Args:
|
||||
model (Type[BaseModel]): Dynamic Pydantic model class.
|
||||
model (type[BaseModel]): Dynamic Pydantic model class.
|
||||
func (Callable): Function to be added as a 'run' method to the model.
|
||||
|
||||
Returns:
|
||||
Type[BaseModel]: Pydantic model class with the added 'run' method.
|
||||
type[BaseModel]: Pydantic model class with the added 'run' method.
|
||||
"""
|
||||
|
||||
def run_method_wrapper(self):
|
||||
|
@ -1204,15 +1202,15 @@ def add_run_method_to_dynamic_model(model: Type[BaseModel], func: Callable):
|
|||
return model
|
||||
|
||||
|
||||
def create_dynamic_models_from_dictionaries(dictionaries: List[dict]):
|
||||
def create_dynamic_models_from_dictionaries(dictionaries: list[dict[str, Any]]):
|
||||
"""
|
||||
Create a list of dynamic Pydantic model classes from a list of dictionaries.
|
||||
|
||||
Args:
|
||||
dictionaries (List[dict]): List of dictionaries representing model structures.
|
||||
dictionaries (list[dict]): List of dictionaries representing model structures.
|
||||
|
||||
Returns:
|
||||
List[Type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
||||
list[type[BaseModel]]: List of generated dynamic Pydantic model classes.
|
||||
"""
|
||||
dynamic_models = []
|
||||
for func in dictionaries:
|
||||
|
@ -1249,7 +1247,7 @@ def list_to_enum(enum_name, values):
|
|||
return Enum(enum_name, {value: value for value in values})
|
||||
|
||||
|
||||
def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "CustomModel") -> Type[BaseModel]:
|
||||
def convert_dictionary_to_pydantic_model(dictionary: dict[str, Any], model_name: str = "CustomModel") -> type[Any]:
|
||||
"""
|
||||
Convert a dictionary to a Pydantic model class.
|
||||
|
||||
|
@ -1258,9 +1256,9 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
|||
model_name (str): Name of the generated Pydantic model.
|
||||
|
||||
Returns:
|
||||
Type[BaseModel]: Generated Pydantic model class.
|
||||
type[BaseModel]: Generated Pydantic model class.
|
||||
"""
|
||||
fields = {}
|
||||
fields: dict[str, Any] = {}
|
||||
|
||||
if "properties" in dictionary:
|
||||
for field_name, field_data in dictionary.get("properties", {}).items():
|
||||
|
@ -1277,7 +1275,7 @@ def convert_dictionary_to_pydantic_model(dictionary: dict, model_name: str = "Cu
|
|||
if items != {}:
|
||||
array = {"properties": items}
|
||||
array_type = convert_dictionary_to_pydantic_model(array, f"{model_name}_{field_name}_items")
|
||||
fields[field_name] = (List[array_type], ...)
|
||||
fields[field_name] = (List[array_type], ...) # type: ignore[valid-type]
|
||||
else:
|
||||
fields[field_name] = (list, ...)
|
||||
elif field_type == "object":
|
||||
|
|
|
@ -26,6 +26,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
|||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||
{ "Q3_K_XS",LLAMA_FTYPE_MOSTLY_Q3_K_XS,"3-bit extra small quantization" , },
|
||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
|
||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
set(TARGET server)
|
||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
|
||||
add_executable(${TARGET} server.cpp json.hpp httplib.h)
|
||||
add_executable(${TARGET} server.cpp oai.hpp utils.hpp json.hpp httplib.h)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_compile_definitions(${TARGET} PRIVATE
|
||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||
|
|
|
@ -30,7 +30,8 @@ Command line options:
|
|||
- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled)
|
||||
- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime)
|
||||
- `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA.
|
||||
|
||||
- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
|
||||
- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
|
||||
## Build
|
||||
|
||||
server is build alongside everything else from the root of the project
|
||||
|
@ -65,6 +66,14 @@ server.exe -m models\7B\ggml-model.gguf -c 2048
|
|||
The above command will start a server that by default listens on `127.0.0.1:8080`.
|
||||
You can consume the endpoints with Postman or NodeJS with axios library. You can visit the web front end at the same url.
|
||||
|
||||
### Docker:
|
||||
```bash
|
||||
docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
|
||||
|
||||
# or, with CUDA:
|
||||
docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
|
||||
```
|
||||
|
||||
## Testing with CURL
|
||||
|
||||
Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS.
|
||||
|
|
208
examples/server/oai.hpp
Normal file
208
examples/server/oai.hpp
Normal file
|
@ -0,0 +1,208 @@
|
|||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "json.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
inline static json oaicompat_completion_params_parse(
|
||||
const json &body /* openai api json semantics */)
|
||||
{
|
||||
json llama_params;
|
||||
|
||||
llama_params["__oaicompat"] = true;
|
||||
|
||||
// Map OpenAI parameters to llama.cpp parameters
|
||||
//
|
||||
// For parameters that are defined by the OpenAI documentation (e.g.
|
||||
// temperature), we explicitly specify OpenAI's intended default; we
|
||||
// need to do that because sometimes OpenAI disagrees with llama.cpp
|
||||
//
|
||||
// https://platform.openai.com/docs/api-reference/chat/create
|
||||
llama_sampling_params default_sparams;
|
||||
llama_params["model"] = json_value(body, "model", std::string("unknown"));
|
||||
llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt'
|
||||
llama_params["cache_prompt"] = json_value(body, "cache_prompt", false);
|
||||
llama_params["temperature"] = json_value(body, "temperature", 0.0);
|
||||
llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k);
|
||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||
llama_params["n_predict"] = json_value(body, "max_tokens", -1);
|
||||
llama_params["logit_bias"] = json_value(body, "logit_bias",json::object());
|
||||
llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0);
|
||||
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
|
||||
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
|
||||
llama_params["stream"] = json_value(body, "stream", false);
|
||||
llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat);
|
||||
llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau);
|
||||
llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta);
|
||||
llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl);
|
||||
llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p);
|
||||
llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
|
||||
llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
|
||||
llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
|
||||
|
||||
if (body.count("grammar") != 0) {
|
||||
llama_params["grammar"] = json_value(body, "grammar", json::object());
|
||||
}
|
||||
|
||||
// Handle 'stop' field
|
||||
if (body.contains("stop") && body["stop"].is_string()) {
|
||||
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
||||
} else {
|
||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||
}
|
||||
|
||||
// Ensure there is ChatML-specific end sequence among stop words
|
||||
llama_params["stop"].push_back("<|im_end|>");
|
||||
|
||||
return llama_params;
|
||||
}
|
||||
|
||||
inline static json format_final_response_oaicompat(const json &request, const task_result &response, bool streaming = false)
|
||||
{
|
||||
json result = response.result_json;
|
||||
|
||||
bool stopped_word = result.count("stopped_word") != 0;
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason = "length";
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
|
||||
json choices =
|
||||
streaming ? json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}})
|
||||
: json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"message", json{{"content", content},
|
||||
{"role", "assistant"}}}}});
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json res =
|
||||
json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"model",
|
||||
json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
|
||||
{"object", streaming ? "chat.completion.chunk" : "chat.completion"},
|
||||
{"usage",
|
||||
json{{"completion_tokens", num_tokens_predicted},
|
||||
{"prompt_tokens", num_prompt_tokens},
|
||||
{"total_tokens", num_tokens_predicted + num_prompt_tokens}}},
|
||||
{"id", gen_chatcmplid()}};
|
||||
|
||||
if (server_verbose) {
|
||||
res["__verbose"] = result;
|
||||
}
|
||||
|
||||
if (result.contains("completion_probabilities")) {
|
||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
// return value is vector as there is one case where we might need to generate two responses
|
||||
inline static std::vector<json> format_partial_response_oaicompat(const task_result &response) {
|
||||
json result = response.result_json;
|
||||
|
||||
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
|
||||
return std::vector<json>({response.result_json});
|
||||
}
|
||||
|
||||
bool first = json_value(result, "oaicompat_token_ctr", 0) == 0;
|
||||
std::string modelname = json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL));
|
||||
|
||||
bool stopped_word = json_value(result, "stopped_word", false);
|
||||
bool stopped_eos = json_value(result, "stopped_eos", false);
|
||||
bool stopped_limit = json_value(result, "stopped_limit", false);
|
||||
std::string content = json_value(result, "content", std::string(""));
|
||||
|
||||
std::string finish_reason;
|
||||
if (stopped_word || stopped_eos) {
|
||||
finish_reason = "stop";
|
||||
}
|
||||
if (stopped_limit) {
|
||||
finish_reason = "length";
|
||||
}
|
||||
|
||||
std::time_t t = std::time(0);
|
||||
|
||||
json choices;
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
choices = json::array({json{{"finish_reason", finish_reason},
|
||||
{"index", 0},
|
||||
{"delta", json::object()}}});
|
||||
} else {
|
||||
if (first) {
|
||||
if (content.empty()) {
|
||||
choices = json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{{"role", "assistant"}}}}});
|
||||
} else {
|
||||
// We have to send this as two updates to conform to openai behavior
|
||||
json initial_ret = json{{"choices", json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"role", "assistant"}
|
||||
}}}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
json second_ret = json{
|
||||
{"choices", json::array({json{{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta", json{
|
||||
{"content", content}}}
|
||||
}})},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({initial_ret, second_ret});
|
||||
}
|
||||
} else {
|
||||
// Some idiosyncrasy in task processing logic makes several trailing calls
|
||||
// with empty content, we ignore these at the calee site.
|
||||
if (content.empty()) {
|
||||
return std::vector<json>({json::object()});
|
||||
}
|
||||
|
||||
choices = json::array({json{
|
||||
{"finish_reason", nullptr},
|
||||
{"index", 0},
|
||||
{"delta",
|
||||
json{
|
||||
{"content", content},
|
||||
}},
|
||||
}});
|
||||
}
|
||||
}
|
||||
|
||||
json ret = json{{"choices", choices},
|
||||
{"created", t},
|
||||
{"id", gen_chatcmplid()},
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}};
|
||||
|
||||
return std::vector<json>({ret});
|
||||
}
|
File diff suppressed because it is too large
Load diff
508
examples/server/utils.hpp
Normal file
508
examples/server/utils.hpp
Normal file
|
@ -0,0 +1,508 @@
|
|||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <set>
|
||||
#include <mutex>
|
||||
#include <condition_variable>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "json.hpp"
|
||||
|
||||
#include "../llava/clip.h"
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
extern bool server_verbose;
|
||||
|
||||
#ifndef SERVER_VERBOSE
|
||||
#define SERVER_VERBOSE 1
|
||||
#endif
|
||||
|
||||
#if SERVER_VERBOSE != 1
|
||||
#define LOG_VERBOSE(MSG, ...)
|
||||
#else
|
||||
#define LOG_VERBOSE(MSG, ...) \
|
||||
do \
|
||||
{ \
|
||||
if (server_verbose) \
|
||||
{ \
|
||||
server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \
|
||||
} \
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#define LOG_ERROR( MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_WARNING(MSG, ...) server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
|
||||
|
||||
//
|
||||
// parallel
|
||||
//
|
||||
|
||||
enum server_state {
|
||||
SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet
|
||||
SERVER_STATE_READY, // Server is ready and model is loaded
|
||||
SERVER_STATE_ERROR // An error occurred, load_model failed
|
||||
};
|
||||
|
||||
enum task_type {
|
||||
TASK_TYPE_COMPLETION,
|
||||
TASK_TYPE_CANCEL,
|
||||
TASK_TYPE_NEXT_RESPONSE
|
||||
};
|
||||
|
||||
struct task_server {
|
||||
int id = -1; // to be filled by llama_server_queue
|
||||
int target_id;
|
||||
task_type type;
|
||||
json data;
|
||||
bool infill_mode = false;
|
||||
bool embedding_mode = false;
|
||||
int multitask_id = -1;
|
||||
};
|
||||
|
||||
struct task_result {
|
||||
int id;
|
||||
int multitask_id = -1;
|
||||
bool stop;
|
||||
bool error;
|
||||
json result_json;
|
||||
};
|
||||
|
||||
struct task_multi {
|
||||
int id;
|
||||
std::set<int> subtasks_remaining{};
|
||||
std::vector<task_result> results{};
|
||||
};
|
||||
|
||||
// TODO: can become bool if we can't find use of more states
|
||||
enum slot_state
|
||||
{
|
||||
IDLE,
|
||||
PROCESSING,
|
||||
};
|
||||
|
||||
enum slot_command
|
||||
{
|
||||
NONE,
|
||||
LOAD_PROMPT,
|
||||
RELEASE,
|
||||
};
|
||||
|
||||
struct slot_params
|
||||
{
|
||||
bool stream = true;
|
||||
bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt
|
||||
|
||||
uint32_t seed = -1; // RNG seed
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
|
||||
std::vector<std::string> antiprompt;
|
||||
|
||||
json input_prefix;
|
||||
json input_suffix;
|
||||
};
|
||||
|
||||
struct slot_image
|
||||
{
|
||||
int32_t id;
|
||||
|
||||
bool request_encode_image = false;
|
||||
float * image_embedding = nullptr;
|
||||
int32_t image_tokens = 0;
|
||||
|
||||
clip_image_u8 * img_data;
|
||||
|
||||
std::string prefix_prompt; // before of this image
|
||||
};
|
||||
|
||||
// completion token output with probabilities
|
||||
struct completion_token_output
|
||||
{
|
||||
struct token_prob
|
||||
{
|
||||
llama_token tok;
|
||||
float prob;
|
||||
};
|
||||
|
||||
std::vector<token_prob> probs;
|
||||
llama_token tok;
|
||||
std::string text_to_send;
|
||||
};
|
||||
|
||||
static inline void server_log(const char *level, const char *function, int line,
|
||||
const char *message, const nlohmann::ordered_json &extra)
|
||||
{
|
||||
nlohmann::ordered_json log
|
||||
{
|
||||
{"timestamp", time(nullptr)},
|
||||
{"level", level},
|
||||
{"function", function},
|
||||
{"line", line},
|
||||
{"message", message},
|
||||
};
|
||||
|
||||
if (!extra.empty())
|
||||
{
|
||||
log.merge_patch(extra);
|
||||
}
|
||||
|
||||
const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
printf("%.*s\n", (int)str.size(), str.data());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
//
|
||||
// server utils
|
||||
//
|
||||
|
||||
template <typename T>
|
||||
static T json_value(const json &body, const std::string &key, const T &default_value)
|
||||
{
|
||||
// Fallback null to default value
|
||||
return body.contains(key) && !body.at(key).is_null()
|
||||
? body.value(key, default_value)
|
||||
: default_value;
|
||||
}
|
||||
|
||||
inline std::string format_chatml(std::vector<json> messages)
|
||||
{
|
||||
std::ostringstream chatml_msgs;
|
||||
|
||||
for (auto it = messages.begin(); it != messages.end(); ++it) {
|
||||
chatml_msgs << "<|im_start|>"
|
||||
<< json_value(*it, "role", std::string("user")) << '\n';
|
||||
chatml_msgs << json_value(*it, "content", std::string(""))
|
||||
<< "<|im_end|>\n";
|
||||
}
|
||||
|
||||
chatml_msgs << "<|im_start|>assistant" << '\n';
|
||||
|
||||
return chatml_msgs.str();
|
||||
}
|
||||
|
||||
//
|
||||
// work queue utils
|
||||
//
|
||||
|
||||
struct llama_server_queue {
|
||||
int id = 0;
|
||||
std::mutex mutex_tasks;
|
||||
// queues
|
||||
std::vector<task_server> queue_tasks;
|
||||
std::vector<task_server> queue_tasks_deferred;
|
||||
std::vector<task_multi> queue_multitasks;
|
||||
std::condition_variable condition_tasks;
|
||||
// callback functions
|
||||
std::function<void(task_server&)> callback_new_task;
|
||||
std::function<void(task_multi&)> callback_finish_multitask;
|
||||
std::function<void(void)> callback_all_task_finished;
|
||||
|
||||
// Add a new task to the end of the queue
|
||||
int post(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (task.id == -1) {
|
||||
task.id = id++;
|
||||
}
|
||||
queue_tasks.push_back(std::move(task));
|
||||
condition_tasks.notify_one();
|
||||
return task.id;
|
||||
}
|
||||
|
||||
// Add a new task, but defer until one slot is available
|
||||
void defer(task_server task) {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
queue_tasks_deferred.push_back(std::move(task));
|
||||
}
|
||||
|
||||
// Get the next id for creating anew task
|
||||
int get_new_id() {
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
return id++;
|
||||
}
|
||||
|
||||
// Register function to process a new task
|
||||
void on_new_task(std::function<void(task_server&)> callback) {
|
||||
callback_new_task = callback;
|
||||
}
|
||||
|
||||
// Register function to process a multitask
|
||||
void on_finish_multitask(std::function<void(task_multi&)> callback) {
|
||||
callback_finish_multitask = callback;
|
||||
}
|
||||
|
||||
// Register the function to be called when the batch of tasks is finished
|
||||
void on_all_tasks_finished(std::function<void(void)> callback) {
|
||||
callback_all_task_finished = callback;
|
||||
}
|
||||
|
||||
// Call when the state of one slot is changed
|
||||
void notify_slot_changed() {
|
||||
// move deferred tasks back to main loop
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
for (auto & task : queue_tasks_deferred) {
|
||||
queue_tasks.push_back(std::move(task));
|
||||
}
|
||||
queue_tasks_deferred.clear();
|
||||
}
|
||||
|
||||
// Start the main loop. This call is blocking
|
||||
[[noreturn]]
|
||||
void start_loop() {
|
||||
while (true) {
|
||||
// new task arrived
|
||||
LOG_VERBOSE("have new task", {});
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
lock.unlock();
|
||||
break;
|
||||
}
|
||||
task_server task = queue_tasks.front();
|
||||
queue_tasks.erase(queue_tasks.begin());
|
||||
lock.unlock();
|
||||
LOG_VERBOSE("callback_new_task", {});
|
||||
callback_new_task(task);
|
||||
}
|
||||
LOG_VERBOSE("callback_all_task_finished", {});
|
||||
// process and update all the multitasks
|
||||
auto queue_iterator = queue_multitasks.begin();
|
||||
while (queue_iterator != queue_multitasks.end())
|
||||
{
|
||||
if (queue_iterator->subtasks_remaining.empty())
|
||||
{
|
||||
// all subtasks done == multitask is done
|
||||
task_multi current_multitask = *queue_iterator;
|
||||
callback_finish_multitask(current_multitask);
|
||||
// remove this multitask
|
||||
queue_iterator = queue_multitasks.erase(queue_iterator);
|
||||
}
|
||||
else
|
||||
{
|
||||
++queue_iterator;
|
||||
}
|
||||
}
|
||||
// all tasks in the current loop is finished
|
||||
callback_all_task_finished();
|
||||
}
|
||||
LOG_VERBOSE("wait for new task", {});
|
||||
// wait for new task
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_tasks);
|
||||
if (queue_tasks.empty()) {
|
||||
condition_tasks.wait(lock, [&]{
|
||||
return !queue_tasks.empty();
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
// functions to manage multitasks
|
||||
//
|
||||
|
||||
// add a multitask by specifying the id of all subtask (subtask is a task_server)
|
||||
void add_multitask(int multitask_id, std::vector<int>& sub_ids)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
task_multi multi;
|
||||
multi.id = multitask_id;
|
||||
std::copy(sub_ids.begin(), sub_ids.end(), std::inserter(multi.subtasks_remaining, multi.subtasks_remaining.end()));
|
||||
queue_multitasks.push_back(multi);
|
||||
}
|
||||
|
||||
// updatethe remaining subtasks, while appending results to multitask
|
||||
void update_multitask(int multitask_id, int subtask_id, task_result& result)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(mutex_tasks);
|
||||
for (auto& multitask : queue_multitasks)
|
||||
{
|
||||
if (multitask.id == multitask_id)
|
||||
{
|
||||
multitask.subtasks_remaining.erase(subtask_id);
|
||||
multitask.results.push_back(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
struct llama_server_response {
|
||||
typedef std::function<void(int, int, task_result&)> callback_multitask_t;
|
||||
callback_multitask_t callback_update_multitask;
|
||||
// for keeping track of all tasks waiting for the result
|
||||
std::set<int> waiting_task_ids;
|
||||
// the main result queue
|
||||
std::vector<task_result> queue_results;
|
||||
std::mutex mutex_results;
|
||||
std::condition_variable condition_results;
|
||||
|
||||
void add_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.insert(task_id);
|
||||
}
|
||||
|
||||
void remove_waiting_task_id(int task_id) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
waiting_task_ids.erase(task_id);
|
||||
}
|
||||
|
||||
// This function blocks the thread until there is a response for this task_id
|
||||
task_result recv(int task_id) {
|
||||
while (true)
|
||||
{
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
condition_results.wait(lock, [&]{
|
||||
return !queue_results.empty();
|
||||
});
|
||||
LOG_VERBOSE("condition_results unblock", {});
|
||||
|
||||
for (int i = 0; i < (int) queue_results.size(); i++)
|
||||
{
|
||||
if (queue_results[i].id == task_id)
|
||||
{
|
||||
assert(queue_results[i].multitask_id == -1);
|
||||
task_result res = queue_results[i];
|
||||
queue_results.erase(queue_results.begin() + i);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// should never reach here
|
||||
}
|
||||
|
||||
// Register the function to update multitask
|
||||
void on_multitask_update(callback_multitask_t callback) {
|
||||
callback_update_multitask = callback;
|
||||
}
|
||||
|
||||
// Send a new result to a waiting task_id
|
||||
void send(task_result result) {
|
||||
std::unique_lock<std::mutex> lock(mutex_results);
|
||||
LOG_VERBOSE("send new result", {});
|
||||
for (auto& task_id : waiting_task_ids) {
|
||||
// LOG_TEE("waiting task id %i \n", task_id);
|
||||
// for now, tasks that have associated parent multitasks just get erased once multitask picks up the result
|
||||
if (result.multitask_id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("callback_update_multitask", {});
|
||||
callback_update_multitask(task_id, result.id, result);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (result.id == task_id)
|
||||
{
|
||||
LOG_VERBOSE("queue_results.push_back", {});
|
||||
queue_results.push_back(result);
|
||||
condition_results.notify_one();
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
//
|
||||
// base64 utils (TODO: move to common in the future)
|
||||
//
|
||||
|
||||
static const std::string base64_chars =
|
||||
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
"abcdefghijklmnopqrstuvwxyz"
|
||||
"0123456789+/";
|
||||
|
||||
static inline bool is_base64(uint8_t c)
|
||||
{
|
||||
return (isalnum(c) || (c == '+') || (c == '/'));
|
||||
}
|
||||
|
||||
static inline std::vector<uint8_t> base64_decode(const std::string & encoded_string)
|
||||
{
|
||||
int i = 0;
|
||||
int j = 0;
|
||||
int in_ = 0;
|
||||
|
||||
int in_len = encoded_string.size();
|
||||
|
||||
uint8_t char_array_4[4];
|
||||
uint8_t char_array_3[3];
|
||||
|
||||
std::vector<uint8_t> ret;
|
||||
|
||||
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_]))
|
||||
{
|
||||
char_array_4[i++] = encoded_string[in_]; in_++;
|
||||
if (i == 4)
|
||||
{
|
||||
for (i = 0; i <4; i++)
|
||||
{
|
||||
char_array_4[i] = base64_chars.find(char_array_4[i]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (i = 0; (i < 3); i++)
|
||||
{
|
||||
ret.push_back(char_array_3[i]);
|
||||
}
|
||||
i = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (i)
|
||||
{
|
||||
for (j = i; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = 0;
|
||||
}
|
||||
|
||||
for (j = 0; j <4; j++)
|
||||
{
|
||||
char_array_4[j] = base64_chars.find(char_array_4[j]);
|
||||
}
|
||||
|
||||
char_array_3[0] = ((char_array_4[0] ) << 2) + ((char_array_4[1] & 0x30) >> 4);
|
||||
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
|
||||
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
|
||||
|
||||
for (j = 0; (j < i - 1); j++)
|
||||
{
|
||||
ret.push_back(char_array_3[j]);
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
//
|
||||
// random string / id
|
||||
//
|
||||
|
||||
static std::string random_string()
|
||||
{
|
||||
static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
|
||||
|
||||
std::random_device rd;
|
||||
std::mt19937 generator(rd());
|
||||
|
||||
std::string result(32, ' ');
|
||||
|
||||
for (int i = 0; i < 32; ++i) {
|
||||
result[i] = str[generator() % str.size()];
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
static std::string gen_chatcmplid()
|
||||
{
|
||||
std::stringstream chatcmplid;
|
||||
chatcmplid << "chatcmpl-" << random_string();
|
||||
return chatcmplid.str();
|
||||
}
|
9
examples/sycl/CMakeLists.txt
Normal file
9
examples/sycl/CMakeLists.txt
Normal file
|
@ -0,0 +1,9 @@
|
|||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
set(TARGET ls-sycl-device)
|
||||
add_executable(${TARGET} ls-sycl-device.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
47
examples/sycl/README.md
Normal file
47
examples/sycl/README.md
Normal file
|
@ -0,0 +1,47 @@
|
|||
# llama.cpp/example/sycl
|
||||
|
||||
This example program provide the tools for llama.cpp for SYCL on Intel GPU.
|
||||
|
||||
## Tool
|
||||
|
||||
|Tool Name| Function|Status|
|
||||
|-|-|-|
|
||||
|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
|
||||
|
||||
### ls-sycl-device
|
||||
|
||||
List all SYCL devices with ID, compute capability, max work group size, ect.
|
||||
|
||||
1. Build the llama.cpp for SYCL for all targets.
|
||||
|
||||
2. Enable oneAPI running environment
|
||||
|
||||
```
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
```
|
||||
|
||||
3. Execute
|
||||
|
||||
```
|
||||
./build/bin/ls-sycl-device
|
||||
```
|
||||
|
||||
Check the ID in startup log, like:
|
||||
|
||||
```
|
||||
found 4 SYCL devices:
|
||||
Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
|
||||
max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
|
||||
Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
|
||||
max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
|
||||
Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
|
||||
max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
|
||||
|
||||
```
|
||||
|
||||
|Attribute|Note|
|
||||
|-|-|
|
||||
|compute capability 1.3|Level-zero running time, recommended |
|
||||
|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
|
20
examples/sycl/build.sh
Executable file
20
examples/sycl/build.sh
Executable file
|
@ -0,0 +1,20 @@
|
|||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
mkdir -p build
|
||||
cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
11
examples/sycl/ls-sycl-device.cpp
Normal file
11
examples/sycl/ls-sycl-device.cpp
Normal file
|
@ -0,0 +1,11 @@
|
|||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#include "ggml-sycl.h"
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
ggml_backend_sycl_print_sycl_devices();
|
||||
return 0;
|
||||
}
|
19
examples/sycl/run-llama2.sh
Executable file
19
examples/sycl/run-llama2.sh
Executable file
|
@ -0,0 +1,19 @@
|
|||
#!/bin/bash
|
||||
|
||||
# MIT license
|
||||
# Copyright (C) 2024 Intel Corporation
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
if [ $# -gt 0 ]; then
|
||||
export GGML_SYCL_DEVICE=$1
|
||||
else
|
||||
export GGML_SYCL_DEVICE=0
|
||||
fi
|
||||
echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
|
||||
#export GGML_SYCL_DEBUG=1
|
||||
./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
|
||||
#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
|
||||
|
|
@ -263,7 +263,6 @@ static void init_model(struct my_llama_model * model) {
|
|||
model->data.resize(size + tensor_alignment);
|
||||
alloc = ggml_allocr_new(model->data.data(), model->data.size(), tensor_alignment);
|
||||
alloc_model(alloc, model);
|
||||
ggml_allocr_free(alloc);
|
||||
}
|
||||
|
||||
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||
|
@ -1102,7 +1101,6 @@ int main(int argc, char ** argv) {
|
|||
alloc = ggml_allocr_new(mem_input_data.data(), mem_input_data.size(), tensor_alignment);
|
||||
ggml_allocr_alloc(alloc, tokens_input);
|
||||
ggml_allocr_alloc(alloc, target_probs);
|
||||
ggml_allocr_free(alloc);
|
||||
|
||||
// context for compute tensors without their data
|
||||
const size_t estimated_compute_size_wo_data = (
|
||||
|
@ -1149,7 +1147,6 @@ int main(int argc, char ** argv) {
|
|||
best_compute_size = max_compute_size;
|
||||
best_order = gf->order;
|
||||
}
|
||||
ggml_allocr_free(alloc);
|
||||
ggml_free(ctx_compute);
|
||||
}
|
||||
size_t max_compute_size = best_compute_size;
|
||||
|
@ -1177,7 +1174,6 @@ int main(int argc, char ** argv) {
|
|||
params.common.use_flash,
|
||||
params.common.use_checkpointing
|
||||
);
|
||||
ggml_allocr_free(alloc);
|
||||
|
||||
std::vector<llama_token> train_tokens;
|
||||
std::vector<size_t> train_samples_begin;
|
||||
|
|
6
flake.lock
generated
6
flake.lock
generated
|
@ -20,11 +20,11 @@
|
|||
},
|
||||
"nixpkgs": {
|
||||
"locked": {
|
||||
"lastModified": 1705133751,
|
||||
"narHash": "sha256-rCIsyE80jgiOU78gCWN3A0wE0tR2GI5nH6MlS+HaaSQ=",
|
||||
"lastModified": 1706191920,
|
||||
"narHash": "sha256-eLihrZAPZX0R6RyM5fYAWeKVNuQPYjAkCUBr+JNvtdE=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "9b19f5e77dd906cb52dade0b7bd280339d2a1f3d",
|
||||
"rev": "ae5c332cbb5827f6b1f02572496b141021de335f",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
|
14
flake.nix
14
flake.nix
|
@ -1,3 +1,17 @@
|
|||
# The flake interface to llama.cpp's Nix expressions. The flake is used as a
|
||||
# more discoverable entry-point, as well as a way to pin the dependencies and
|
||||
# expose default outputs, including the outputs built by the CI.
|
||||
|
||||
# For more serious applications involving some kind of customization you may
|
||||
# want to consider consuming the overlay, or instantiating `llamaPackages`
|
||||
# directly:
|
||||
#
|
||||
# ```nix
|
||||
# pkgs.callPackage ${llama-cpp-root}/.devops/nix/scope.nix { }`
|
||||
# ```
|
||||
|
||||
# Cf. https://jade.fyi/blog/flakes-arent-real/ for a more detailed exposition
|
||||
# of the relation between Nix and the Nix Flakes.
|
||||
{
|
||||
description = "Port of Facebook's LLaMA model in C/C++";
|
||||
|
||||
|
|
|
@ -109,8 +109,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
|
|||
if (block->size >= size) {
|
||||
best_fit_block = alloc->n_free_blocks - 1;
|
||||
} else {
|
||||
fprintf(stderr, "%s: not enough space in the buffer (needed %zu, largest block available %zu)\n",
|
||||
__func__, size, max_avail);
|
||||
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
|
||||
__func__, tensor->name, size, max_avail);
|
||||
GGML_ASSERT(!"not enough space in the buffer");
|
||||
return;
|
||||
}
|
||||
|
@ -335,7 +335,9 @@ bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
|
|||
}
|
||||
|
||||
size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
|
||||
return alloc->max_size;
|
||||
// FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
|
||||
// to avoid this, we add a 10% margin to the buffer size
|
||||
return alloc->max_size + alloc->max_size/10;
|
||||
}
|
||||
|
||||
// graph allocator
|
||||
|
|
|
@ -38,7 +38,9 @@ size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
|
|||
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
|
||||
// get_alloc_size is optional, defaults to ggml_nbytes
|
||||
if (buft->iface.get_alloc_size) {
|
||||
return buft->iface.get_alloc_size(buft, tensor);
|
||||
size_t size = buft->iface.get_alloc_size(buft, tensor);
|
||||
assert(size >= ggml_nbytes(tensor));
|
||||
return size;
|
||||
}
|
||||
return ggml_nbytes(tensor);
|
||||
}
|
||||
|
@ -356,6 +358,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
|
|||
ggml_backend_cuda_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
extern void ggml_backend_sycl_reg_devices(void);
|
||||
ggml_backend_sycl_reg_devices();
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
|
||||
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
@ -718,6 +725,8 @@ GGML_CALL static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, str
|
|||
|
||||
GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_CPY:
|
||||
return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||
default:
|
||||
|
@ -902,6 +911,9 @@ struct ggml_backend_sched {
|
|||
__attribute__((aligned(GGML_MEM_ALIGN)))
|
||||
#endif
|
||||
char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
|
||||
|
||||
ggml_backend_sched_eval_callback callback_eval;
|
||||
void * callback_eval_user_data;
|
||||
};
|
||||
|
||||
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
|
||||
|
@ -1286,6 +1298,24 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
ggml_tallocr_t src_allocr = node_allocr(src);
|
||||
GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
|
||||
if (src_allocr != node_allocr) {
|
||||
// create a copy of the input in the split's backend
|
||||
size_t id = hash_id(src);
|
||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||
|
||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
||||
node_allocr(tensor_copy) = cur_allocr;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
|
||||
int n_inputs = sched->splits[cur_split].n_inputs++;
|
||||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||
}
|
||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
||||
|
||||
#if 0
|
||||
// check if the input is already in the split
|
||||
bool found = false;
|
||||
for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
|
||||
|
@ -1301,19 +1331,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
|
||||
sched->splits[cur_split].inputs[n_inputs] = src;
|
||||
}
|
||||
|
||||
// create a copy of the input in the split's backend
|
||||
size_t id = hash_id(src);
|
||||
if (sched->node_copies[id][cur_backend_id] == NULL) {
|
||||
ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
|
||||
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
|
||||
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
|
||||
|
||||
sched->node_copies[id][cur_backend_id] = tensor_copy;
|
||||
node_allocr(tensor_copy) = cur_allocr;
|
||||
SET_CAUSE(tensor_copy, "4.cpy");
|
||||
}
|
||||
node->src[j] = sched->node_copies[id][cur_backend_id];
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1424,9 +1442,38 @@ static void sched_compute_splits(ggml_backend_sched_t sched) {
|
|||
ggml_graph_dump_dot(split->graph, NULL, split_filename);
|
||||
#endif
|
||||
|
||||
|
||||
uint64_t compute_start_us = ggml_time_us();
|
||||
ggml_backend_graph_compute(split_backend, &split->graph);
|
||||
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
||||
if (!sched->callback_eval) {
|
||||
ggml_backend_graph_compute(split_backend, &split->graph);
|
||||
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
|
||||
} else {
|
||||
// similar to ggml_backend_compare_graph_backend
|
||||
for (int j0 = 0; j0 < split->graph.n_nodes; j0++) {
|
||||
struct ggml_tensor * t = split->graph.nodes[j0];
|
||||
|
||||
// check if the user needs data from this node
|
||||
bool need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
||||
|
||||
int j1 = j0;
|
||||
|
||||
// determine the range [j0, j1] of nodes that can be computed together
|
||||
while (!need && j1 < split->graph.n_nodes - 1) {
|
||||
t = split->graph.nodes[++j1];
|
||||
need = sched->callback_eval(t, true, sched->callback_eval_user_data);
|
||||
}
|
||||
|
||||
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
|
||||
|
||||
ggml_backend_graph_compute(split_backend, &gv);
|
||||
|
||||
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
|
||||
break;
|
||||
}
|
||||
|
||||
j0 = j1;
|
||||
}
|
||||
}
|
||||
uint64_t compute_end_us = ggml_time_us();
|
||||
compute_us[split_backend_id] += compute_end_us - compute_start_us;
|
||||
}
|
||||
|
@ -1531,6 +1578,12 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
|
|||
sched_reset(sched);
|
||||
}
|
||||
|
||||
|
||||
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
|
||||
sched->callback_eval = callback;
|
||||
sched->callback_eval_user_data = user_data;
|
||||
}
|
||||
|
||||
int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
|
||||
return sched->n_splits;
|
||||
}
|
||||
|
|
|
@ -151,6 +151,14 @@ extern "C" {
|
|||
struct ggml_backend_sched;
|
||||
typedef struct ggml_backend_sched * ggml_backend_sched_t;
|
||||
|
||||
// when ask == true, the scheduler wants to know if the user wants to observe this node
|
||||
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
|
||||
//
|
||||
// when ask == false, the scheduler is passing the node tensor to the user for observation
|
||||
// if the user returns false, the scheduler will cancel the graph compute
|
||||
//
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
@ -171,6 +179,9 @@ extern "C" {
|
|||
// Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
|
||||
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
||||
|
||||
// Set a callback to be called for each resulting node during graph compute
|
||||
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
|
||||
|
||||
//
|
||||
// Utils
|
||||
//
|
||||
|
|
134
ggml-cuda.cu
134
ggml-cuda.cu
|
@ -12,9 +12,10 @@
|
|||
#include <vector>
|
||||
#include <map>
|
||||
#include <array>
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string
|
||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#include <hip/hip_runtime.h>
|
||||
|
@ -118,6 +119,11 @@
|
|||
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
|
||||
// ggml-cuda need half type so keep ggml headers include at last
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
|
||||
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
||||
|
||||
#define CC_PASCAL 600
|
||||
|
@ -582,13 +588,28 @@ static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0,
|
|||
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
|
||||
|
||||
[[noreturn]]
|
||||
static __device__ void bad_arch() {
|
||||
printf("ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n");
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||
file_name, line, function_name, arch);
|
||||
(void) arch_list;
|
||||
#else
|
||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||
file_name, line, function_name, arch, arch_list);
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
__trap();
|
||||
|
||||
(void) bad_arch; // suppress unused function warning
|
||||
(void) no_device_code; // suppress unused function warning
|
||||
}
|
||||
|
||||
#ifdef __CUDA_ARCH__
|
||||
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
|
||||
#else
|
||||
#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
|
@ -615,7 +636,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
|||
return a;
|
||||
#else
|
||||
(void) a;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
|
@ -636,7 +657,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
|||
return x;
|
||||
#else
|
||||
(void) x;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
}
|
||||
|
||||
|
@ -2419,7 +2440,7 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h
|
|||
}
|
||||
#else
|
||||
(void) vx; (void) y; (void) k;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
|
@ -2450,7 +2471,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_0_q8_1_imp
|
|||
// second part effectively subtracts 8 from each quant value
|
||||
return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2487,7 +2508,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q4_1_q8_1_imp
|
|||
// scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
|
||||
return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2522,7 +2543,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_0_q8_1_imp
|
|||
// second part effectively subtracts 16 from each quant value
|
||||
return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2567,7 +2588,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q5_1_q8_1_imp
|
|||
return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2588,7 +2609,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_0_q8_1_imp
|
|||
|
||||
return d8_0*d8_1 * sumi;
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2618,7 +2639,7 @@ template <int vdr> static __device__ __forceinline__ float vec_dot_q8_1_q8_1_imp
|
|||
// scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
|
||||
return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2653,7 +2674,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq(
|
|||
|
||||
return dm2f.x*sumf_d - dm2f.y*sumf_m;
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2690,7 +2711,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq(
|
|||
|
||||
return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2730,7 +2751,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq(
|
|||
|
||||
return d3 * sumf;
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2755,7 +2776,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq(
|
|||
|
||||
return d3*d8 * sumi;
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2788,7 +2809,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq(
|
|||
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2821,7 +2842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq(
|
|||
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2861,7 +2882,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq(
|
|||
return dm5f.x*sumf_d - dm5f.y*sumf_m;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2894,7 +2915,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq(
|
|||
return dm4f.x*sumf_d - dm4f.y*sumf_m;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2924,7 +2945,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq(
|
|||
|
||||
return d*sumf;
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -2955,7 +2976,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq(
|
|||
return d6 * sumf_d;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
}
|
||||
|
||||
|
@ -3821,7 +3842,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|||
return dall * sumf_d - dmin * sumf_m;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif
|
||||
|
@ -4004,7 +4025,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
|
|||
return d * sumf_d;
|
||||
|
||||
#else
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif
|
||||
|
@ -4262,7 +4283,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|||
q8 += 8;
|
||||
aux32 >>= 7;
|
||||
}
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
return d * sumi;
|
||||
#else
|
||||
// iqs is 0...15
|
||||
|
@ -4273,7 +4294,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1(
|
|||
const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]);
|
||||
const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]);
|
||||
const uint32_t aux32 = q2[2] | (q2[3] << 16);
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127];
|
||||
const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127];
|
||||
const int8_t * q8 = bq8_1[ib32].qs + 16*il;
|
||||
|
@ -4318,7 +4339,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
|
|||
}
|
||||
q8 += 8;
|
||||
}
|
||||
const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f;
|
||||
const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
|
||||
return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
|
||||
#else
|
||||
assert(false);
|
||||
|
@ -4499,7 +4520,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q4_0_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4568,7 +4589,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q4_1_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4635,7 +4656,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q5_0_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4702,7 +4723,7 @@ mul_mat_q5_1(
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q5_1_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4769,7 +4790,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q8_0_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4836,7 +4857,7 @@ mul_mat_q2_K(
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q2_K_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4905,7 +4926,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q3_K_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -4974,7 +4995,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q4_K_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -5041,7 +5062,7 @@ mul_mat_q5_K(
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q5_K_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -5110,7 +5131,7 @@ template <bool need_check> static __global__ void
|
|||
(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
|
||||
#else
|
||||
(void) vec_dot_q6_K_q8_1_mul_mat;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
}
|
||||
|
||||
|
@ -5131,10 +5152,10 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
|
|||
const block_q_t * x = (const block_q_t *) vx;
|
||||
const block_q8_1 * y = (const block_q8_1 *) vy;
|
||||
|
||||
for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index
|
||||
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
|
||||
const int ibx = row*blocks_per_row + i; // x block index
|
||||
|
||||
const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx
|
||||
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
|
||||
|
||||
const int iqs = vdr * (threadIdx.x % (qi/vdr)); // x block quant index when casting the quants to int
|
||||
|
||||
|
@ -5833,7 +5854,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds
|
|||
}
|
||||
#else
|
||||
(void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale;
|
||||
bad_arch();
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
|
||||
}
|
||||
|
||||
|
@ -9769,8 +9790,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s
|
|||
// TODO: mmq/mmv support
|
||||
#endif
|
||||
|
||||
const int64_t nb11 = src1->nb[1];
|
||||
const int64_t nb1 = dst->nb[1];
|
||||
const size_t nb11 = src1->nb[1];
|
||||
const size_t nb1 = dst->nb[1];
|
||||
|
||||
const struct ggml_tensor * ids = src0;
|
||||
const int32_t id = ((int32_t *) dst->op_params)[0];
|
||||
|
@ -10283,15 +10304,11 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t
|
|||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
// initialize padding to 0 to avoid possible NaN values
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
||||
size_t original_size = ggml_nbytes_split(tensor, nrows_split);
|
||||
size_t original_size = ggml_nbytes(tensor);
|
||||
size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
|
||||
|
||||
if (padded_size > original_size && tensor->view_src == nullptr) {
|
||||
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
|
||||
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -10394,12 +10411,7 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
|
|||
}
|
||||
|
||||
GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
|
||||
int64_t row_low = 0;
|
||||
int64_t row_high = ggml_nrows(tensor);
|
||||
int64_t nrows_split = row_high - row_low;
|
||||
|
||||
size_t size = ggml_nbytes_split(tensor, nrows_split);
|
||||
|
||||
size_t size = ggml_nbytes(tensor);
|
||||
int64_t ne0 = tensor->ne[0];
|
||||
|
||||
if (ggml_is_quantized(tensor->type)) {
|
||||
|
@ -10921,6 +10933,12 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
if (a->ne[3] != b->ne[3]) {
|
||||
return false;
|
||||
}
|
||||
ggml_type a_type = a->type;
|
||||
if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS) {
|
||||
if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_GET_ROWS:
|
||||
|
|
125
ggml-metal.m
125
ggml-metal.m
|
@ -26,15 +26,6 @@
|
|||
|
||||
#define GGML_METAL_MAX_KERNELS 256
|
||||
|
||||
struct ggml_metal_buffer {
|
||||
const char * name;
|
||||
|
||||
void * data;
|
||||
size_t size;
|
||||
|
||||
id<MTLBuffer> metal;
|
||||
};
|
||||
|
||||
struct ggml_metal_kernel {
|
||||
id<MTLFunction> function;
|
||||
id<MTLComputePipelineState> pipeline;
|
||||
|
@ -172,9 +163,6 @@ struct ggml_metal_context {
|
|||
|
||||
dispatch_queue_t d_queue;
|
||||
|
||||
int n_buffers;
|
||||
struct ggml_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||
|
||||
struct ggml_metal_kernel kernels[GGML_METAL_MAX_KERNELS];
|
||||
|
||||
bool support_simdgroup_reduction;
|
||||
|
@ -238,30 +226,24 @@ static void * ggml_metal_host_malloc(size_t n) {
|
|||
static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
||||
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
|
||||
|
||||
id<MTLDevice> device;
|
||||
NSString * s;
|
||||
|
||||
#if TARGET_OS_OSX
|
||||
#if TARGET_OS_OSX && !GGML_METAL_NDEBUG
|
||||
// Show all the Metal device instances in the system
|
||||
NSArray * devices = MTLCopyAllDevices();
|
||||
for (device in devices) {
|
||||
s = [device name];
|
||||
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [s UTF8String]);
|
||||
for (id<MTLDevice> device in devices) {
|
||||
GGML_METAL_LOG_INFO("%s: found device: %s\n", __func__, [[device name] UTF8String]);
|
||||
}
|
||||
[devices release]; // since it was created by a *Copy* C method
|
||||
#endif
|
||||
|
||||
// Pick and show default Metal device
|
||||
device = MTLCreateSystemDefaultDevice();
|
||||
s = [device name];
|
||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [s UTF8String]);
|
||||
id<MTLDevice> device = MTLCreateSystemDefaultDevice();
|
||||
GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
|
||||
|
||||
// Configure context
|
||||
struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
|
||||
ctx->device = device;
|
||||
ctx->n_cb = MIN(n_cb, GGML_METAL_MAX_BUFFERS);
|
||||
ctx->queue = [ctx->device newCommandQueue];
|
||||
ctx->n_buffers = 0;
|
||||
|
||||
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
|
||||
|
||||
// load library
|
||||
|
@ -279,6 +261,10 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
} else {
|
||||
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
||||
|
||||
|
@ -303,27 +289,25 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
@autoreleasepool {
|
||||
// dictionary of preprocessor macros
|
||||
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
||||
|
||||
#ifdef GGML_QKK_64
|
||||
prep[@"QK_K"] = @(64);
|
||||
prep[@"QK_K"] = @(64);
|
||||
#endif
|
||||
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
MTLCompileOptions* options = [MTLCompileOptions new];
|
||||
options.preprocessorMacros = prep;
|
||||
|
||||
//[options setFastMathEnabled:false];
|
||||
//[options setFastMathEnabled:false];
|
||||
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
|
||||
[options release];
|
||||
[prep release];
|
||||
}
|
||||
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||
if (error) {
|
||||
GGML_METAL_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -534,10 +518,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
static void ggml_metal_free(struct ggml_metal_context * ctx) {
|
||||
GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
|
||||
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
[ctx->buffers[i].metal release];
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_METAL_MAX_KERNELS; ++i) {
|
||||
if (ctx->kernels[i].pipeline) {
|
||||
[ctx->kernels[i].pipeline release];
|
||||
|
@ -580,51 +560,30 @@ struct ggml_backend_metal_buffer_context {
|
|||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||
// Metal buffer based on the host memory pointer
|
||||
//
|
||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
||||
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_tensor * t, size_t * offs) {
|
||||
//GGML_METAL_LOG_INFO("%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
||||
|
||||
const int64_t tsize = ggml_nbytes(t);
|
||||
|
||||
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||
|
||||
// compatibility with ggml-backend
|
||||
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||
|
||||
// find the view that contains the tensor fully
|
||||
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
||||
|
||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
||||
|
||||
return buf_ctx->buffers[i].metal;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||
|
||||
return nil;
|
||||
}
|
||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||
|
||||
// find the view that contains the tensor fully
|
||||
for (int i = 0; i < ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
||||
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
||||
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
||||
|
||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, ctx->buffers[%d].size = %10ld, name = %s\n", ioffs, tsize, ioffs + tsize, i, ctx->buffers[i].size, ctx->buffers[i].name);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
||||
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
||||
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
||||
*offs = (size_t) ioffs;
|
||||
|
||||
//GGML_METAL_LOG_INFO("%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
||||
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
||||
|
||||
return ctx->buffers[i].metal;
|
||||
return buf_ctx->buffers[i].metal;
|
||||
}
|
||||
}
|
||||
|
||||
GGML_METAL_LOG_ERROR("%s: error: buffer is nil\n", __func__);
|
||||
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||
|
||||
return nil;
|
||||
}
|
||||
|
@ -671,7 +630,8 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
return true;
|
||||
case GGML_OP_MUL_MAT:
|
||||
case GGML_OP_MUL_MAT_ID:
|
||||
return ctx->support_simdgroup_reduction;
|
||||
return ctx->support_simdgroup_reduction &&
|
||||
(op->src[0]->type != GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F32);
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CONT:
|
||||
|
@ -713,7 +673,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
static bool ggml_metal_graph_compute(
|
||||
struct ggml_metal_context * ctx,
|
||||
struct ggml_cgraph * gf) {
|
||||
@autoreleasepool {
|
||||
|
||||
MTLComputePassDescriptor * edesc = MTLComputePassDescriptor.computePassDescriptor;
|
||||
edesc.dispatchType = MTLDispatchTypeSerial;
|
||||
|
@ -817,9 +776,9 @@ static bool ggml_metal_graph_compute(
|
|||
const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT;
|
||||
const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT;
|
||||
|
||||
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(ctx, src0, &offs_src0) : nil;
|
||||
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(ctx, src1, &offs_src1) : nil;
|
||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(ctx, dst, &offs_dst) : nil;
|
||||
id<MTLBuffer> id_src0 = src0 ? ggml_metal_get_buffer(src0, &offs_src0) : nil;
|
||||
id<MTLBuffer> id_src1 = src1 ? ggml_metal_get_buffer(src1, &offs_src1) : nil;
|
||||
id<MTLBuffer> id_dst = dst ? ggml_metal_get_buffer(dst, &offs_dst) : nil;
|
||||
|
||||
//GGML_METAL_LOG_INFO("%s: op - %s\n", __func__, ggml_op_name(dst->op));
|
||||
//if (src0) {
|
||||
|
@ -1601,7 +1560,7 @@ static bool ggml_metal_graph_compute(
|
|||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||
|
||||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
|
||||
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j];
|
||||
}
|
||||
|
@ -1746,7 +1705,7 @@ static bool ggml_metal_graph_compute(
|
|||
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||
|
||||
size_t offs_src_cur = 0;
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur);
|
||||
|
||||
[encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j];
|
||||
}
|
||||
|
@ -2236,10 +2195,7 @@ static bool ggml_metal_graph_compute(
|
|||
#endif
|
||||
}
|
||||
|
||||
if (encoder != nil) {
|
||||
[encoder endEncoding];
|
||||
encoder = nil;
|
||||
}
|
||||
[encoder endEncoding];
|
||||
|
||||
[command_buffer commit];
|
||||
});
|
||||
|
@ -2259,7 +2215,6 @@ static bool ggml_metal_graph_compute(
|
|||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -714,7 +714,6 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
@ -784,6 +783,7 @@ __kernel void KERNEL_NAME(__global X_TYPE* x, __local float* tmp, __global float
|
|||
dst[row] = tmp[0];
|
||||
}
|
||||
}
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
@ -799,6 +799,18 @@ __kernel void KERNEL_NAME(__global TYPE* x, const int x_offset, __global TYPE* y
|
|||
}
|
||||
);
|
||||
|
||||
std::string add_template = MULTILINE_QUOTE(
|
||||
__kernel void add_f32(__global float * x, const int x_offset, __global float * y, const int y_offset, __global float * dst, const int dst_offset, const int ky) {
|
||||
const int i = get_group_id(0)*get_local_size(0) + get_local_id(0);
|
||||
|
||||
if (i >= get_global_size(0)) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[dst_offset + i] = x[x_offset + i] + y[y_offset + i%ky];
|
||||
}
|
||||
);
|
||||
|
||||
#define CL_CHECK(err) \
|
||||
do { \
|
||||
cl_int err_ = (err); \
|
||||
|
@ -878,6 +890,7 @@ static std::string generate_kernels() {
|
|||
}
|
||||
src << mul_kernel << '\n';
|
||||
}
|
||||
src << add_template << '\n';
|
||||
|
||||
return src.str();
|
||||
}
|
||||
|
@ -893,6 +906,7 @@ static cl_kernel dequantize_mul_mat_vec_q4_0_cl, dequantize_mul_mat_vec_q4_1_cl,
|
|||
static cl_kernel dequantize_block_q2_k_cl, dequantize_block_q3_k_cl, dequantize_block_q4_k_cl, dequantize_block_q5_k_cl, dequantize_block_q6_k_cl;
|
||||
static cl_kernel dequantize_mul_mat_vec_q2_K_cl, dequantize_mul_mat_vec_q3_K_cl, dequantize_mul_mat_vec_q4_K_cl, dequantize_mul_mat_vec_q5_K_cl, dequantize_mul_mat_vec_q6_K_cl;
|
||||
static cl_kernel mul_f32_cl;
|
||||
static cl_kernel add_f32_cl;
|
||||
static bool fp16_support;
|
||||
|
||||
static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
|
||||
|
@ -1100,9 +1114,10 @@ void ggml_cl_init(void) {
|
|||
char *ext_buffer = (char *)alloca(ext_str_size + 1);
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, ext_str_size, ext_buffer, NULL);
|
||||
ext_buffer[ext_str_size] = '\0'; // ensure it is null terminated
|
||||
// Disabled due to faulty outputs
|
||||
// Check if ext_buffer contains cl_khr_fp16
|
||||
fp16_support = strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
||||
fp16_support = false; // strstr(ext_buffer, "cl_khr_fp16") != NULL;
|
||||
// fprintf(stderr, "ggml_opencl: device FP16 support: %s\n", fp16_support ? "true" : "false");
|
||||
|
||||
cl_context_properties properties[] = {
|
||||
(intptr_t)CL_CONTEXT_PLATFORM, (intptr_t)platform, 0
|
||||
|
@ -1150,6 +1165,8 @@ void ggml_cl_init(void) {
|
|||
|
||||
// mul kernel
|
||||
CL_CHECK((mul_f32_cl = clCreateKernel(program, "mul_f32", &err), err));
|
||||
|
||||
CL_CHECK((add_f32_cl = clCreateKernel(program, "add_f32", &err), err));
|
||||
}
|
||||
|
||||
static cl_kernel* ggml_get_to_fp32_cl(ggml_type type) {
|
||||
|
@ -1458,6 +1475,70 @@ void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src
|
|||
ggml_cl_mul_f32(src0, src1, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_add_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne03 = src0->ne[3];
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
const int64_t ne11 = src1->ne[1];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
const int64_t ne13 = src1->ne[3];
|
||||
const int nb2 = dst->nb[2];
|
||||
const int nb3 = dst->nb[3];
|
||||
size_t x_size;
|
||||
size_t d_size;
|
||||
|
||||
cl_mem d_X = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &x_size); // src0
|
||||
cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
|
||||
cl_mem d_D = ggml_cl_pool_malloc(ne00 * ne01 * sizeof(float), &d_size); // dst
|
||||
|
||||
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||
cl_event ev;
|
||||
|
||||
// copy src0 to device
|
||||
CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, &ev));
|
||||
|
||||
const int64_t i13 = i03%ne13;
|
||||
const int64_t i12 = i02%ne12;
|
||||
const int i1 = i13*ne12*ne11 + i12*ne11;
|
||||
|
||||
cl_int x_offset = 0;
|
||||
cl_int y_offset = i1*ne10;
|
||||
cl_int d_offset = 0;
|
||||
|
||||
size_t global = ne00 * ne01;
|
||||
cl_int ky = ne10 * ne11;
|
||||
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 0, sizeof(cl_mem), &d_X));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 1, sizeof(cl_int), &x_offset));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 2, sizeof(cl_mem), &d_Y));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 3, sizeof(cl_int), &y_offset));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 4, sizeof(cl_mem), &d_D));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 5, sizeof(cl_int), &d_offset));
|
||||
CL_CHECK(clSetKernelArg(add_f32_cl, 6, sizeof(cl_int), &ky));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, add_f32_cl, 1, NULL, &global, NULL, 1, &ev, NULL));
|
||||
|
||||
CL_CHECK(clReleaseEvent(ev));
|
||||
CL_CHECK(clFinish(queue));
|
||||
|
||||
// copy dst to host
|
||||
float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * ne00*ne01, d, 0, NULL, NULL));
|
||||
}
|
||||
}
|
||||
ggml_cl_pool_free(d_X, x_size);
|
||||
ggml_cl_pool_free(d_D, d_size);
|
||||
}
|
||||
|
||||
void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
|
||||
ggml_cl_add_f32(src0, src1, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
|
|
|
@ -10,6 +10,7 @@ extern "C" {
|
|||
GGML_API void ggml_cl_init(void);
|
||||
|
||||
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
|
||||
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
||||
|
|
|
@ -1274,7 +1274,12 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|||
}
|
||||
float sumlx = 0;
|
||||
float suml2 = 0;
|
||||
#ifdef HAVE_BUGGY_APPLE_LINKER
|
||||
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
||||
for (volatile int i = 0; i < n; ++i) {
|
||||
#else
|
||||
for (int i = 0; i < n; ++i) {
|
||||
#endif
|
||||
int l = nearest_int(iscale * x[i]);
|
||||
l = MAX(-nmax, MIN(nmax-1, l));
|
||||
L[i] = l + nmax;
|
||||
|
@ -1649,7 +1654,12 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|||
float max = x[0];
|
||||
float sum_w = weights ? weights[0] : x[0]*x[0];
|
||||
float sum_x = sum_w * x[0];
|
||||
#ifdef HAVE_BUGGY_APPLE_LINKER
|
||||
// use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7
|
||||
for (volatile int i = 1; i < n; ++i) {
|
||||
#else
|
||||
for (int i = 1; i < n; ++i) {
|
||||
#endif
|
||||
if (x[i] < min) min = x[i];
|
||||
if (x[i] > max) max = x[i];
|
||||
float w = weights ? weights[i] : x[i]*x[i];
|
||||
|
@ -1660,7 +1670,7 @@ static float make_qkx3_quants(int n, int nmax, const float * restrict x, const f
|
|||
min = 0;
|
||||
}
|
||||
if (max <= min) {
|
||||
for (int i = 0; i < n; ++i) L[i] = 0;
|
||||
memset(L, 0, n);
|
||||
*the_min = -min;
|
||||
return 0.f;
|
||||
}
|
||||
|
@ -1862,7 +1872,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|||
|
||||
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2181,7 +2191,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|||
|
||||
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2448,7 +2458,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|||
|
||||
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -2771,7 +2781,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|||
|
||||
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -3025,7 +3035,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|||
|
||||
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
||||
(void)hist;
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
||||
if (!quant_weights) {
|
||||
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
||||
}
|
||||
|
@ -3072,7 +3082,7 @@ size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
if (!quant_weights) {
|
||||
return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights);
|
||||
|
@ -3116,7 +3126,7 @@ size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
if (!quant_weights) {
|
||||
return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights);
|
||||
|
@ -3169,7 +3179,7 @@ size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
if (!quant_weights) {
|
||||
return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights);
|
||||
|
@ -3221,7 +3231,7 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|||
if (!quant_weights) {
|
||||
return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
|
||||
}
|
||||
int row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
||||
char * qrow = (char *)dst;
|
||||
for (int row = 0; row < nrow; ++row) {
|
||||
quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights);
|
||||
|
@ -8565,7 +8575,7 @@ static int iq2_compare_func(const void * left, const void * right) {
|
|||
return l[0] < r[0] ? -1 : l[0] > r[0] ? 1 : l[1] < r[1] ? -1 : l[1] > r[1] ? 1 : 0;
|
||||
}
|
||||
|
||||
static void q2xs_init_impl(int grid_size) {
|
||||
void iq2xs_init_impl(int grid_size) {
|
||||
const int gindex = iq2_data_index(grid_size);
|
||||
if (iq2_data[gindex].grid) {
|
||||
return;
|
||||
|
@ -8720,19 +8730,7 @@ static void q2xs_init_impl(int grid_size) {
|
|||
free(dist2);
|
||||
}
|
||||
|
||||
void ggml_init_iq2_quantization(enum ggml_type type) {
|
||||
if (type == GGML_TYPE_IQ2_XXS) {
|
||||
q2xs_init_impl(256);
|
||||
}
|
||||
else if (type == GGML_TYPE_IQ2_XS) {
|
||||
q2xs_init_impl(512);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
||||
}
|
||||
}
|
||||
|
||||
static void q2xs_deinit_impl(int grid_size) {
|
||||
void iq2xs_free_impl(int grid_size) {
|
||||
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
||||
const int gindex = iq2_data_index(grid_size);
|
||||
if (iq2_data[gindex].grid) {
|
||||
|
@ -8742,18 +8740,6 @@ static void q2xs_deinit_impl(int grid_size) {
|
|||
}
|
||||
}
|
||||
|
||||
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
||||
if (type == GGML_TYPE_IQ2_XXS) {
|
||||
q2xs_deinit_impl(256);
|
||||
}
|
||||
else if (type == GGML_TYPE_IQ2_XS) {
|
||||
q2xs_deinit_impl(512);
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
||||
}
|
||||
}
|
||||
|
||||
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
|
||||
const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
|
||||
int num_neighbors = neighbours[0];
|
||||
|
@ -8786,10 +8772,10 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict
|
|||
const int * kmap_q2xs = iq2_data[gindex].map;
|
||||
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
||||
|
||||
GGML_ASSERT(quant_weights);
|
||||
GGML_ASSERT(kgrid_q2xs);
|
||||
GGML_ASSERT(kmap_q2xs);
|
||||
GGML_ASSERT(kneighbors_q2xs);
|
||||
GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
@ -9005,10 +8991,10 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|||
const int * kmap_q2xs = iq2_data[gindex].map;
|
||||
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
||||
|
||||
GGML_ASSERT(quant_weights);
|
||||
GGML_ASSERT(kmap_q2xs);
|
||||
GGML_ASSERT(kgrid_q2xs);
|
||||
GGML_ASSERT(kneighbors_q2xs);
|
||||
GGML_ASSERT(quant_weights && "missing quantization weights");
|
||||
GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
|
||||
GGML_ASSERT(n%QK_K == 0);
|
||||
|
||||
const int kMaxQ = 3;
|
||||
|
|
|
@ -257,3 +257,6 @@ size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row,
|
|||
size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
void iq2xs_init_impl(int grid_size);
|
||||
void iq2xs_free_impl(int grid_size);
|
||||
|
|
15197
ggml-sycl.cpp
Normal file
15197
ggml-sycl.cpp
Normal file
File diff suppressed because it is too large
Load diff
27
ggml-sycl.h
Normal file
27
ggml-sycl.h
Normal file
|
@ -0,0 +1,27 @@
|
|||
/*MIT license
|
||||
Copyright (C) 2024 Intel Corporation
|
||||
SPDX-License-Identifier: MIT
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define GGML_SYCL_MAX_DEVICES 16
|
||||
#define GGML_SYCL_NAME "SYCL"
|
||||
|
||||
GGML_API void ggml_init_sycl(void);
|
||||
GGML_API bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
|
||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
430
ggml.c
430
ggml.c
|
@ -250,6 +250,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
|
|||
#include "ggml-opencl.h"
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
#include "ggml-vulkan.h"
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
// floating point type used to accumulate sums
|
||||
|
@ -396,12 +398,6 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
|||
static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y);
|
||||
static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y);
|
||||
|
||||
ggml_collect_imatrix_t g_imatrix_collect = NULL;
|
||||
|
||||
void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) {
|
||||
g_imatrix_collect = imatrix_collect;
|
||||
}
|
||||
|
||||
static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
||||
[GGML_TYPE_I8] = {
|
||||
.type_name = "i8",
|
||||
|
@ -1426,6 +1422,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
|||
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
||||
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
|
||||
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
|
||||
// TODO: optimize performance
|
||||
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
||||
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
|
||||
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
|
@ -1784,9 +1783,11 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
|
|||
"GELU",
|
||||
"GELU_QUICK",
|
||||
"SILU",
|
||||
"HARDSWISH",
|
||||
"HARDSIGMOID",
|
||||
};
|
||||
|
||||
static_assert(GGML_UNARY_OP_COUNT == 10, "GGML_UNARY_OP_COUNT != 10");
|
||||
static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
|
||||
|
||||
|
||||
static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
|
||||
|
@ -2298,6 +2299,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
|
|||
ggml_cl_init();
|
||||
#elif defined(GGML_USE_VULKAN)
|
||||
ggml_vk_init();
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
ggml_init_sycl();
|
||||
#endif
|
||||
|
||||
ggml_setup_op_has_task_pass();
|
||||
|
@ -3955,6 +3958,20 @@ struct ggml_tensor * ggml_silu_back(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml hardswish
|
||||
struct ggml_tensor * ggml_hardswish(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSWISH);
|
||||
}
|
||||
|
||||
// ggml hardsigmoid
|
||||
struct ggml_tensor * ggml_hardsigmoid(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_unary(ctx, a, GGML_UNARY_OP_HARDSIGMOID);
|
||||
}
|
||||
|
||||
// ggml_norm
|
||||
|
||||
static struct ggml_tensor * ggml_norm_impl(
|
||||
|
@ -5354,6 +5371,31 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
|||
return result;
|
||||
}
|
||||
|
||||
// ggml_conv_depthwise
|
||||
struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
|
||||
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
|
||||
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
|
||||
s0, s1, p0, p1, d0, d1, true); // [N * IC, OH, OW, KH * KW]
|
||||
|
||||
struct ggml_tensor * result =
|
||||
ggml_mul_mat(ctx,
|
||||
ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1), // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
|
||||
ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3])); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
|
||||
|
||||
result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW]
|
||||
|
||||
return result;
|
||||
}
|
||||
// ggml_conv_2d
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
|
@ -7173,6 +7215,17 @@ static void ggml_compute_forward_add_f32(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
#ifdef GGML_USE_CLBLAST
|
||||
if (src1->backend == GGML_BACKEND_GPU) {
|
||||
// TODO: OpenCL kernel support full broadcast
|
||||
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
|
||||
if (ith == 0) {
|
||||
ggml_cl_add(src0, src1, dst);
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
const int nr = ggml_nrows(src0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
@ -7453,7 +7506,12 @@ static void ggml_compute_forward_add(
|
|||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_add_f32(params, src0, src1, dst);
|
||||
if (src1->type == GGML_TYPE_F32) {
|
||||
ggml_compute_forward_add_f32(params, src0, src1, dst);
|
||||
}
|
||||
else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
|
@ -7774,6 +7832,9 @@ static void ggml_compute_forward_acc_f32(
|
|||
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
||||
|
||||
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
// memcpy needs to be synchronized across threads to avoid race conditions.
|
||||
// => do it in INIT phase
|
||||
memcpy(
|
||||
|
@ -9343,6 +9404,87 @@ static void ggml_compute_forward_silu_back(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_hardswish_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(params->ith == 0);
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
assert(dst->nb[0] == sizeof(float));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
ggml_vec_hardswish_f32(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||
}
|
||||
}
|
||||
static void ggml_compute_forward_hardswish(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_hardswish_f32(params, src0, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_hardsigmoid_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst) {
|
||||
assert(params->ith == 0);
|
||||
assert(ggml_are_same_shape(src0, dst));
|
||||
|
||||
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
const int n = ggml_nrows(src0);
|
||||
const int nc = src0->ne[0];
|
||||
|
||||
assert(dst->nb[0] == sizeof(float));
|
||||
assert(src0->nb[0] == sizeof(float));
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
ggml_vec_hardsigmoid_f32(nc,
|
||||
(float *) ((char *) dst->data + i*( dst->nb[1])),
|
||||
(float *) ((char *) src0->data + i*(src0->nb[1])));
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_hardsigmoid(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
ggml_compute_forward_hardsigmoid_f32(params, src0, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_norm
|
||||
|
||||
static void ggml_compute_forward_norm_f32(
|
||||
|
@ -9794,10 +9936,6 @@ static void ggml_compute_forward_mul_mat(
|
|||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
if (ith == 1 && g_imatrix_collect) {
|
||||
g_imatrix_collect(src0, src1);
|
||||
}
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
@ -9839,11 +9977,30 @@ static void ggml_compute_forward_mul_mat(
|
|||
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(dst)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
const int64_t ne_plane = ne01*ne00;
|
||||
const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
|
||||
UNUSED(desired_wsize);
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (type != GGML_TYPE_F32) {
|
||||
assert(params->wsize >= desired_wsize);
|
||||
// parallelize by src0 rows
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
// broadcast src0 into src1 across 2nd,3rd dimension
|
||||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
|
||||
const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
|
||||
float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
||||
|
||||
for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
|
||||
to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -9851,9 +10008,14 @@ static void ggml_compute_forward_mul_mat(
|
|||
return;
|
||||
}
|
||||
|
||||
// perform sgemm, parallelization controlled by blas lib
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
//const int64_t tgemm0 = ggml_perf_time_us();
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++) {
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++) {
|
||||
// broadcast src0 into src1 across 2nd,3rd dimension
|
||||
const int64_t i03 = i13/r3;
|
||||
const int64_t i02 = i12/r2;
|
||||
|
||||
|
@ -9862,17 +10024,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
|
||||
|
||||
if (type != GGML_TYPE_F32) {
|
||||
float * const wdata = params->wdata;
|
||||
ggml_to_float_t const to_float = type_traits[type].to_float;
|
||||
|
||||
size_t id = 0;
|
||||
for (int64_t i01 = 0; i01 < ne01; ++i01) {
|
||||
to_float((const char *) x + i01*nb01, wdata + id, ne00);
|
||||
id += ne00;
|
||||
}
|
||||
|
||||
assert(id*sizeof(float) <= params->wsize);
|
||||
x = wdata;
|
||||
x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
|
||||
}
|
||||
|
||||
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
|
||||
|
@ -9882,6 +10034,7 @@ static void ggml_compute_forward_mul_mat(
|
|||
0.0f, d, ne01);
|
||||
}
|
||||
}
|
||||
//printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
|
||||
|
||||
//printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
|
||||
|
||||
|
@ -9890,6 +10043,9 @@ static void ggml_compute_forward_mul_mat(
|
|||
#endif
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
if (src1->type != vec_dot_type) {
|
||||
char * wdata = params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
@ -10054,6 +10210,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)]
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
char * wdata = params->wdata;
|
||||
if (src1->type != vec_dot_type) {
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
@ -10101,10 +10260,6 @@ static void ggml_compute_forward_mul_mat_id(
|
|||
|
||||
const struct ggml_tensor * src0_cur = dst->src[cur_a + 2];
|
||||
|
||||
if (ith == 1 && g_imatrix_collect) {
|
||||
g_imatrix_collect(src0_cur, src1);
|
||||
}
|
||||
|
||||
const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
|
||||
const size_t row_size = ggml_row_size(vec_dot_type, ne10);
|
||||
|
||||
|
@ -10243,6 +10398,9 @@ static void ggml_compute_forward_out_prod_f32(
|
|||
return;
|
||||
}
|
||||
#endif
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
||||
return;
|
||||
}
|
||||
|
@ -10426,6 +10584,9 @@ static void ggml_compute_forward_out_prod_q_f32(
|
|||
// TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
ggml_vec_set_f32(ne0*ne1*ne2*ne3, dst->data, 0);
|
||||
return;
|
||||
}
|
||||
|
@ -10610,6 +10771,9 @@ static void ggml_compute_forward_set_f32(
|
|||
bool inplace = (bool) ((int32_t *) dst->op_params)[4];
|
||||
|
||||
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
// memcpy needs to be synchronized across threads to avoid race conditions.
|
||||
// => do it in INIT phase
|
||||
memcpy(
|
||||
|
@ -10934,6 +11098,9 @@ static void ggml_compute_forward_get_rows_back_f32_f16(
|
|||
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
}
|
||||
|
||||
|
@ -10968,6 +11135,9 @@ static void ggml_compute_forward_get_rows_back_f32(
|
|||
// ggml_compute_forward_dup_same_cont(params, opt0, dst);
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
}
|
||||
|
||||
|
@ -11105,6 +11275,9 @@ static void ggml_compute_forward_diag_mask_f32(
|
|||
GGML_ASSERT(n_past >= 0);
|
||||
|
||||
if (!inplace && (params->type == GGML_TASK_INIT)) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
// memcpy needs to be synchronized across threads to avoid race conditions.
|
||||
// => do it in INIT phase
|
||||
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
|
||||
|
@ -12075,6 +12248,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// permute kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
|
||||
|
@ -12169,6 +12345,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// prepare kernel data (src0) from (K x Cout x Cin) to (Cin x K x Cout)
|
||||
|
@ -12367,6 +12546,7 @@ static void ggml_compute_forward_im2col(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_conv_transpose_2d
|
||||
|
||||
static void ggml_compute_forward_conv_transpose_2d(
|
||||
|
@ -12392,6 +12572,9 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// permute kernel data (src0) from (Kw x Kh x Cout x Cin) to (Cin x Kw x Kh x Cout)
|
||||
|
@ -13935,6 +14118,14 @@ static void ggml_compute_forward_unary(
|
|||
{
|
||||
ggml_compute_forward_silu(params, src0, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
{
|
||||
ggml_compute_forward_hardswish(params, src0, dst);
|
||||
} break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
{
|
||||
ggml_compute_forward_hardsigmoid(params, src0, dst);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
|
@ -13998,6 +14189,9 @@ static void ggml_compute_forward_add_rel_pos_f32(
|
|||
|
||||
const bool inplace = (bool) ((int32_t *) dst->op_params)[0];
|
||||
if (!inplace && params->type == GGML_TASK_INIT) {
|
||||
if (params->ith != 0) {
|
||||
return;
|
||||
}
|
||||
memcpy((char *) dst->data, (char *) src0->data, ggml_nbytes(dst));
|
||||
return;
|
||||
}
|
||||
|
@ -14527,6 +14721,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
|
||||
if (skip_cpu) {
|
||||
return;
|
||||
}
|
||||
#endif // GGML_USE_SYCL
|
||||
switch (tensor->op) {
|
||||
case GGML_OP_DUP:
|
||||
{
|
||||
|
@ -16303,8 +16503,9 @@ struct ggml_compute_state_shared {
|
|||
const int n_threads;
|
||||
|
||||
// synchronization primitives
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
atomic_int n_active; // num active threads
|
||||
atomic_int node_n; // active graph node
|
||||
atomic_int node_task; // active graph node task phase
|
||||
|
||||
bool (*abort_callback)(void * data); // abort ggml_graph_compute when true
|
||||
void * abort_callback_data;
|
||||
|
@ -16360,6 +16561,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_ELU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
|
||||
case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
|
||||
{
|
||||
n_tasks = 1;
|
||||
} break;
|
||||
|
@ -16436,7 +16639,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||
} break;
|
||||
case GGML_OP_SOFT_MAX:
|
||||
{
|
||||
n_tasks = MIN(MIN(4, n_threads), ggml_nrows(node->src[0]));
|
||||
n_tasks = MIN(n_threads, ggml_nrows(node->src[0]));
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
|
@ -16550,6 +16753,34 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
|
|||
return n_tasks;
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_thread_sync_node(int * node_n, struct ggml_compute_state * state, const bool do_yield) {
|
||||
// wait for other threads to finish
|
||||
const int last_node_n = * node_n;
|
||||
|
||||
while (true) {
|
||||
if (do_yield) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
* node_n = atomic_load(&state->shared->node_n);
|
||||
if (* node_n != last_node_n) break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
|
||||
// wait for other threads to finish
|
||||
const int last_task_phase = * task_phase;
|
||||
|
||||
while (true) {
|
||||
if (do_yield) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
* task_phase = atomic_load(&state->shared->node_task);
|
||||
if (* task_phase != last_task_phase) break;
|
||||
}
|
||||
}
|
||||
|
||||
static thread_ret_t ggml_graph_compute_thread(void * data) {
|
||||
struct ggml_compute_state * state = (struct ggml_compute_state *) data;
|
||||
|
||||
|
@ -16560,7 +16791,8 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
set_numa_thread_affinity(state->ith, n_threads);
|
||||
|
||||
int node_n = -1;
|
||||
int node_n = -1;
|
||||
int task_phase = GGML_TASK_FINALIZE;
|
||||
|
||||
while (true) {
|
||||
if (cplan->abort_callback && cplan->abort_callback(cplan->abort_callback_data)) {
|
||||
|
@ -16592,7 +16824,6 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
// distribute new work or execute it direct if 1T
|
||||
while (++node_n < cgraph->n_nodes) {
|
||||
GGML_PRINT_DEBUG_5("%s: %d/%d\n", __func__, node_n, cgraph->n_nodes);
|
||||
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
||||
|
||||
|
@ -16601,13 +16832,13 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
|
||||
params.nth = n_tasks;
|
||||
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
params.type = GGML_TASK_INIT;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
|
||||
if (n_tasks == 1) {
|
||||
/* INIT */
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
params.type = GGML_TASK_INIT;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
|
||||
// TODO: maybe push node_n to the atomic but if other threads see n_tasks is 1,
|
||||
// they do something more efficient than spinning (?)
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
|
@ -16628,38 +16859,24 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
}
|
||||
}
|
||||
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_n, node_n);
|
||||
task_phase = GGML_TASK_INIT;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_n, node_n);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
} else {
|
||||
// wait for other threads to finish
|
||||
const int last = node_n;
|
||||
|
||||
const bool do_yield = last < 0 || cgraph->nodes[last]->op == GGML_OP_MUL_MAT;
|
||||
|
||||
while (true) {
|
||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||
// depending on the workload and the operating system.
|
||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||
// UPD: adding the do_yield flag seems to resolve the issue universally
|
||||
if (do_yield) {
|
||||
sched_yield();
|
||||
}
|
||||
|
||||
node_n = atomic_load(&state->shared->node_n);
|
||||
if (node_n != last) break;
|
||||
};
|
||||
ggml_graph_compute_thread_sync_node(&node_n, state, false);
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
||||
}
|
||||
|
||||
// check if we should stop
|
||||
if (node_n >= cgraph->n_nodes) break;
|
||||
|
||||
/* COMPUTE */
|
||||
/* INIT & COMPUTE */
|
||||
struct ggml_tensor * node = cgraph->nodes[node_n];
|
||||
const int n_tasks = ggml_get_n_tasks(node, n_threads);
|
||||
|
||||
struct ggml_compute_params params = {
|
||||
/*.type =*/ GGML_TASK_COMPUTE,
|
||||
/*.type =*/ GGML_TASK_INIT,
|
||||
/*.ith =*/ state->ith,
|
||||
/*.nth =*/ n_tasks,
|
||||
/*.wsize =*/ cplan->work_size,
|
||||
|
@ -16667,8 +16884,39 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
|
|||
};
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
if (GGML_OP_HAS_INIT[node->op]) {
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
task_phase = GGML_TASK_COMPUTE;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
}
|
||||
else {
|
||||
// TODO: this sched_yield can have significant impact on the performance - either positive or negative
|
||||
// depending on the workload and the operating system.
|
||||
// since it is not clear what is the best approach, it should potentially become user-configurable
|
||||
// ref: https://github.com/ggerganov/ggml/issues/291
|
||||
// UPD: adding the do_yield flag seems to resolve the issue universally
|
||||
const bool do_yield = node_n < 0 || cgraph->nodes[node_n]->op == GGML_OP_MUL_MAT;
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, do_yield);
|
||||
}
|
||||
|
||||
if (state->ith < n_tasks) {
|
||||
params.type = GGML_TASK_COMPUTE;
|
||||
ggml_compute_forward(¶ms, node);
|
||||
}
|
||||
|
||||
if (atomic_fetch_sub(&state->shared->n_active, 1) == 1) {
|
||||
task_phase = GGML_TASK_FINALIZE;
|
||||
atomic_store(&state->shared->n_active, n_threads);
|
||||
atomic_store(&state->shared->node_task, task_phase);
|
||||
}
|
||||
else {
|
||||
ggml_graph_compute_thread_sync_task(&task_phase, state, false);
|
||||
}
|
||||
}
|
||||
|
||||
return GGML_EXIT_SUCCESS;
|
||||
|
@ -16725,8 +16973,11 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
|
|||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
|
||||
if (ggml_compute_forward_mul_mat_use_blas(node)) {
|
||||
if (node->src[0]->type != GGML_TYPE_F32) {
|
||||
// here we need memory just for single 2D matrix from src0
|
||||
cur = ggml_type_size(GGML_TYPE_F32)*(node->src[0]->ne[0]*node->src[0]->ne[1]);
|
||||
// here we need memory for fully dequantized matrix from src0
|
||||
// take into account that src0 can be broadcasted into src1[2,3]
|
||||
cur = ggml_type_size(GGML_TYPE_F32)
|
||||
* node->src[0]->ne[0]*node->src[0]->ne[1]
|
||||
* node->src[1]->ne[2]*node->src[1]->ne[3];
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
|
@ -16891,6 +17142,7 @@ int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
|
|||
/*.n_threads =*/ n_threads,
|
||||
/*.n_active =*/ n_threads,
|
||||
/*.node_n =*/ -1,
|
||||
/*.node_task =*/ GGML_TASK_FINALIZE,
|
||||
/*.abort_callback =*/ NULL,
|
||||
/*.abort_callback_data =*/ NULL,
|
||||
};
|
||||
|
@ -18569,6 +18821,28 @@ enum ggml_opt_result ggml_opt_resume_g(
|
|||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void ggml_quantize_init(enum ggml_type type) {
|
||||
ggml_critical_section_start();
|
||||
|
||||
switch (type) {
|
||||
case GGML_TYPE_IQ2_XXS: iq2xs_init_impl(256); break;
|
||||
case GGML_TYPE_IQ2_XS: iq2xs_init_impl(512); break;
|
||||
default: // nothing
|
||||
break;
|
||||
}
|
||||
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
|
||||
void ggml_quantize_free(void) {
|
||||
ggml_critical_section_start();
|
||||
|
||||
iq2xs_free_impl(256);
|
||||
iq2xs_free_impl(512);
|
||||
|
||||
ggml_critical_section_end();
|
||||
}
|
||||
|
||||
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
||||
assert(k % QK4_0 == 0);
|
||||
const int nb = k / QK4_0;
|
||||
|
@ -18696,9 +18970,15 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|||
return (n/QK8_0*sizeof(block_q8_0));
|
||||
}
|
||||
|
||||
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
||||
return
|
||||
type == GGML_TYPE_IQ2_XXS ||
|
||||
type == GGML_TYPE_IQ2_XS;
|
||||
}
|
||||
|
||||
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
||||
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
||||
(void)imatrix;
|
||||
ggml_quantize_init(type); // this is noop if already initialized
|
||||
size_t result = 0;
|
||||
int n = nrows * n_per_row;
|
||||
switch (type) {
|
||||
|
@ -18811,13 +19091,13 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|||
} break;
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
int elemsize = sizeof(ggml_fp16_t);
|
||||
size_t elemsize = sizeof(ggml_fp16_t);
|
||||
ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n);
|
||||
result = n * elemsize;
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
int elemsize = sizeof(float);
|
||||
size_t elemsize = sizeof(float);
|
||||
result = n * elemsize;
|
||||
memcpy((uint8_t *)dst + start * elemsize, src + start, result);
|
||||
} break;
|
||||
|
@ -20041,7 +20321,7 @@ int ggml_cpu_has_wasm_simd(void) {
|
|||
}
|
||||
|
||||
int ggml_cpu_has_blas(void) {
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST)
|
||||
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
|
@ -20072,8 +20352,16 @@ int ggml_cpu_has_vulkan(void) {
|
|||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sycl(void) {
|
||||
#if defined(GGML_USE_SYCL)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_gpublas(void) {
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan();
|
||||
return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
|
|
50
ggml.h
50
ggml.h
|
@ -489,6 +489,8 @@ extern "C" {
|
|||
GGML_UNARY_OP_GELU,
|
||||
GGML_UNARY_OP_GELU_QUICK,
|
||||
GGML_UNARY_OP_SILU,
|
||||
GGML_UNARY_OP_HARDSWISH,
|
||||
GGML_UNARY_OP_HARDSIGMOID,
|
||||
|
||||
GGML_UNARY_OP_COUNT,
|
||||
};
|
||||
|
@ -1032,6 +1034,16 @@ extern "C" {
|
|||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b);
|
||||
|
||||
// hardswish(x) = x * relu6(x + 3) / 6
|
||||
GGML_API struct ggml_tensor * ggml_hardswish(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// hardsigmoid(x) = relu6(x + 3) / 6
|
||||
GGML_API struct ggml_tensor * ggml_hardsigmoid(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a);
|
||||
|
||||
// normalize along rows
|
||||
GGML_API struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -1483,6 +1495,17 @@ extern "C" {
|
|||
int d1,
|
||||
bool is_2D);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_conv_1d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -2065,6 +2088,18 @@ extern "C" {
|
|||
// quantization
|
||||
//
|
||||
|
||||
// - ggml_quantize_init can be called multiple times with the same type
|
||||
// it will only initialize the quantization tables for the first call or after ggml_quantize_free
|
||||
// automatically called by ggml_quantize_chunk for convenience
|
||||
//
|
||||
// - ggml_quantize_free will free any memory allocated by ggml_quantize_init
|
||||
// call this at the end of the program to avoid memory leaks
|
||||
//
|
||||
// note: these are thread-safe
|
||||
//
|
||||
GGML_API void ggml_quantize_init(enum ggml_type type);
|
||||
GGML_API void ggml_quantize_free(void);
|
||||
|
||||
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
||||
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
@ -2078,19 +2113,13 @@ extern "C" {
|
|||
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
||||
|
||||
// some quantization type cannot be used without an importance matrix
|
||||
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
||||
|
||||
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
||||
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
||||
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
||||
|
||||
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
||||
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
||||
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
||||
|
||||
//
|
||||
// Importance matrix
|
||||
//
|
||||
typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1);
|
||||
GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect);
|
||||
|
||||
//
|
||||
// gguf
|
||||
//
|
||||
|
@ -2238,6 +2267,7 @@ extern "C" {
|
|||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
//
|
||||
|
|
|
@ -97,8 +97,11 @@ class MODEL_ARCH(IntEnum):
|
|||
BLOOM = auto()
|
||||
STABLELM = auto()
|
||||
QWEN = auto()
|
||||
QWEN2 = auto()
|
||||
PHI2 = auto()
|
||||
PLAMO = auto()
|
||||
CODESHELL = auto()
|
||||
ORION = auto()
|
||||
|
||||
|
||||
class MODEL_TENSOR(IntEnum):
|
||||
|
@ -145,8 +148,11 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
|||
MODEL_ARCH.BLOOM: "bloom",
|
||||
MODEL_ARCH.STABLELM: "stablelm",
|
||||
MODEL_ARCH.QWEN: "qwen",
|
||||
MODEL_ARCH.QWEN2: "qwen2",
|
||||
MODEL_ARCH.PHI2: "phi2",
|
||||
MODEL_ARCH.PLAMO: "plamo",
|
||||
MODEL_ARCH.CODESHELL: "codeshell",
|
||||
MODEL_ARCH.ORION: "orion",
|
||||
}
|
||||
|
||||
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
||||
|
@ -356,6 +362,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.QWEN2: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.PLAMO: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
|
@ -396,7 +416,36 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
]
|
||||
],
|
||||
MODEL_ARCH.CODESHELL: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.POS_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_QKV,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
MODEL_ARCH.ORION: [
|
||||
MODEL_TENSOR.TOKEN_EMBD,
|
||||
MODEL_TENSOR.OUTPUT_NORM,
|
||||
MODEL_TENSOR.OUTPUT,
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_NORM,
|
||||
MODEL_TENSOR.ATTN_Q,
|
||||
MODEL_TENSOR.ATTN_K,
|
||||
MODEL_TENSOR.ATTN_V,
|
||||
MODEL_TENSOR.ATTN_OUT,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
MODEL_TENSOR.FFN_NORM,
|
||||
MODEL_TENSOR.FFN_GATE,
|
||||
MODEL_TENSOR.FFN_DOWN,
|
||||
MODEL_TENSOR.FFN_UP,
|
||||
],
|
||||
# TODO
|
||||
}
|
||||
|
||||
|
@ -417,6 +466,14 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
|||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.CODESHELL: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
MODEL_ARCH.ORION: [
|
||||
MODEL_TENSOR.ROPE_FREQS,
|
||||
MODEL_TENSOR.ATTN_ROT_EMBD,
|
||||
],
|
||||
}
|
||||
|
||||
#
|
||||
|
|
|
@ -107,7 +107,7 @@ class GGUFReader:
|
|||
offs, tensors_fields = self._build_tensors_fields(offs, tensor_count)
|
||||
new_align = self.fields.get('general.alignment')
|
||||
if new_align is not None:
|
||||
if new_align.types != [GGUFValueType.UINT64]:
|
||||
if new_align.types != [GGUFValueType.UINT32]:
|
||||
raise ValueError('Bad type for general.alignment field')
|
||||
self.alignment = new_align.parts[-1][0]
|
||||
padding = offs % self.alignment
|
||||
|
|
|
@ -154,6 +154,7 @@ class TensorNameMap:
|
|||
"model.layers.{bid}.self_attn.rotary_emb.inv_freq", # llama-hf
|
||||
"layers.{bid}.attention.inner_attention.rope.freqs", # llama-pth
|
||||
"model.layers.layers.{bid}.self_attn.rotary_emb.inv_freq", # plamo
|
||||
"transformer.h.{bid}.attn.rotary_emb.inv_freq", # codeshell
|
||||
),
|
||||
|
||||
# Feed-forward norm
|
||||
|
|
18
llama.h
18
llama.h
|
@ -2,9 +2,13 @@
|
|||
#define LLAMA_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
#ifdef GGML_USE_CUBLAS
|
||||
#include "ggml-cuda.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
|
||||
#elif defined(GGML_USE_SYCL)
|
||||
#include "ggml-sycl.h"
|
||||
#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
|
||||
#else
|
||||
#define LLAMA_MAX_DEVICES 1
|
||||
#endif // GGML_USE_CUBLAS
|
||||
|
@ -45,7 +49,7 @@
|
|||
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
|
||||
#define LLAMA_SESSION_VERSION 4
|
||||
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN)
|
||||
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
|
||||
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
|
||||
#define LLAMA_SUPPORTS_GPU_OFFLOAD
|
||||
#endif
|
||||
|
@ -106,6 +110,7 @@ extern "C" {
|
|||
LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
|
||||
LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
|
||||
|
||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||
};
|
||||
|
@ -231,6 +236,9 @@ extern "C" {
|
|||
float yarn_beta_slow; // YaRN high correction dim
|
||||
uint32_t yarn_orig_ctx; // YaRN original context size
|
||||
|
||||
ggml_backend_sched_eval_callback cb_eval;
|
||||
void * cb_eval_user_data;
|
||||
|
||||
enum ggml_type type_k; // data type for K cache
|
||||
enum ggml_type type_v; // data type for V cache
|
||||
|
||||
|
@ -770,6 +778,14 @@ extern "C" {
|
|||
float p,
|
||||
size_t min_keep);
|
||||
|
||||
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
|
||||
LLAMA_API void llama_sample_entropy(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates_p,
|
||||
float min_temp,
|
||||
float max_temp,
|
||||
float exponent_val);
|
||||
|
||||
LLAMA_API void llama_sample_temp(
|
||||
struct llama_context * ctx,
|
||||
llama_token_data_array * candidates,
|
||||
|
|
1
mypy.ini
1
mypy.ini
|
@ -4,3 +4,4 @@ allow_untyped_calls = true
|
|||
allow_untyped_defs = true
|
||||
allow_incomplete_defs = true
|
||||
disable_error_code = import-untyped
|
||||
warn_return_any = false
|
||||
|
|
|
@ -243,7 +243,6 @@ int main(int argc, char** argv) {
|
|||
if (useQ4_1) q41.resize(n4);
|
||||
else q40.resize(n4);
|
||||
std::vector<block_q8_0> q8(n8);
|
||||
std::vector<int64_t> H(16, 0);
|
||||
double sumt = 0, sumt2 = 0, maxt = 0;
|
||||
double sumqt = 0, sumqt2 = 0, maxqt = 0;
|
||||
double sum = 0, sumq = 0, exactSum = 0;
|
||||
|
|
50
scripts/ci-run.sh
Executable file
50
scripts/ci-run.sh
Executable file
|
@ -0,0 +1,50 @@
|
|||
#!/bin/bash
|
||||
set -euo pipefail
|
||||
this=$(realpath "$0"); readonly this
|
||||
cd "$(dirname "$this")"
|
||||
shellcheck "$this"
|
||||
|
||||
if (( $# != 1 && $# != 2 )); then
|
||||
cat >&2 <<'EOF'
|
||||
usage:
|
||||
ci-run.sh <tmp_dir> [<cache_dir>]
|
||||
|
||||
This script wraps ci/run.sh:
|
||||
* If <tmp_dir> is a ramdisk, you can reduce writes to your SSD. If <tmp_dir> is not a ramdisk, keep in mind that total writes will increase by the size of <cache_dir>.
|
||||
(openllama_3b_v2: quantized models are about 30GB)
|
||||
* Persistent model and data files are synced to and from <cache_dir>,
|
||||
excluding generated .gguf files.
|
||||
(openllama_3b_v2: persistent files are about 6.6GB)
|
||||
* <cache_dir> defaults to ~/.cache/llama.cpp
|
||||
EOF
|
||||
exit 1
|
||||
fi
|
||||
|
||||
cd .. # => llama.cpp repo root
|
||||
|
||||
tmp="$1"
|
||||
mkdir -p "$tmp"
|
||||
tmp=$(realpath "$tmp")
|
||||
echo >&2 "Using tmp=$tmp"
|
||||
|
||||
cache="${2-$HOME/.cache/llama.cpp}"
|
||||
mkdir -p "$cache"
|
||||
cache=$(realpath "$cache")
|
||||
echo >&2 "Using cache=$cache"
|
||||
|
||||
_sync() {
|
||||
local from="$1"; shift
|
||||
local to="$1"; shift
|
||||
|
||||
echo >&2 "Syncing from $from to $to"
|
||||
mkdir -p "$from" "$to"
|
||||
rsync -a "$from" "$to" --delete-during "$@"
|
||||
}
|
||||
|
||||
_sync "$(realpath .)/" "$tmp/llama.cpp"
|
||||
_sync "$cache/ci-mnt/models/" "$tmp/llama.cpp/ci-mnt/models/"
|
||||
|
||||
cd "$tmp/llama.cpp"
|
||||
bash ci/run.sh ci-out ci-mnt
|
||||
|
||||
_sync 'ci-mnt/models/' "$cache/ci-mnt/models/" --exclude='*.gguf' -P
|
10
scripts/get-hellaswag.sh
Executable file
10
scripts/get-hellaswag.sh
Executable file
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
|
||||
wget https://raw.githubusercontent.com/klosax/hellaswag_text_data/main/hellaswag_val_full.txt
|
||||
|
||||
echo "Usage:"
|
||||
echo ""
|
||||
echo " ./perplexity -m model.gguf -f hellaswag_val_full.txt --hellaswag [--hellaswag-tasks N] [other params]"
|
||||
echo ""
|
||||
|
||||
exit 0
|
|
@ -1,3 +1,10 @@
|
|||
#!/bin/bash
|
||||
|
||||
wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
||||
|
||||
echo "Usage:"
|
||||
echo ""
|
||||
echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]"
|
||||
echo ""
|
||||
|
||||
exit 0
|
||||
|
|
10
scripts/get-winogrande.sh
Executable file
10
scripts/get-winogrande.sh
Executable file
|
@ -0,0 +1,10 @@
|
|||
#!/bin/bash
|
||||
|
||||
wget https://huggingface.co/datasets/ikawrakow/winogrande-eval-for-llama.cpp/raw/main/winogrande-debiased-eval.csv
|
||||
|
||||
echo "Usage:"
|
||||
echo ""
|
||||
echo " ./perplexity -m model.gguf -f winogrande-debiased-eval.csv --winogrande [--winogrande-tasks N] [other params]"
|
||||
echo ""
|
||||
|
||||
exit 0
|
|
@ -46,7 +46,7 @@ Formatting considerations:
|
|||
- To define multiple "reverse_prompt" properties simultaneously the expected format is a list of strings.
|
||||
- To define a tensor split, pass a list of floats.
|
||||
"""
|
||||
usage = "run_with_preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
||||
usage = "run-with-preset.py [-h] [yaml_files ...] [--<ARG_NAME> <ARG_VALUE> ...]"
|
||||
epilog = (" --<ARG_NAME> specify additional CLI ars to be passed to the binary (override all preset files). "
|
||||
"Unknown args will be ignored.")
|
||||
|
|
@ -1 +1 @@
|
|||
b306d6e996ec0ace77118fa5098822cdc7f9c88f
|
||||
c2448f88d17395452a587d0176d19ed87e0f7ce1
|
||||
|
|
3
tests/.gitignore
vendored
Normal file
3
tests/.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
*
|
||||
!*.*
|
||||
test-c.o
|
|
@ -1,6 +1,6 @@
|
|||
function(llama_build_executable source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
add_executable(${TEST_TARGET} ${source} get-model.cpp)
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
endfunction()
|
||||
|
@ -8,14 +8,20 @@ endfunction()
|
|||
function(llama_test_executable name source)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_test(NAME ${name} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
set_property(TEST ${name} PROPERTY LABELS "main")
|
||||
endfunction()
|
||||
|
||||
function(llama_build_and_test_executable source)
|
||||
llama_build_and_test_executable_with_label(${source} "main")
|
||||
endfunction()
|
||||
|
||||
function(llama_build_and_test_executable_with_label source label)
|
||||
get_filename_component(TEST_TARGET ${source} NAME_WE)
|
||||
add_executable(${TEST_TARGET} ${source})
|
||||
add_executable(${TEST_TARGET} ${source} get-model.cpp)
|
||||
install(TARGETS ${TEST_TARGET} RUNTIME)
|
||||
target_link_libraries(${TEST_TARGET} PRIVATE common)
|
||||
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${ARGN})
|
||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${label})
|
||||
endfunction()
|
||||
|
||||
# llama_build_and_test_executable(test-double-float.cpp) # SLOW
|
||||
|
@ -52,6 +58,9 @@ llama_build_and_test_executable(test-backend-ops.cpp)
|
|||
|
||||
llama_build_and_test_executable(test-rope.cpp)
|
||||
|
||||
llama_build_and_test_executable_with_label(test-model-load-cancel.cpp "model")
|
||||
llama_build_and_test_executable_with_label(test-autorelease.cpp "model")
|
||||
|
||||
# dummy executable - not installed
|
||||
get_filename_component(TEST_TARGET test-c.c NAME_WE)
|
||||
add_executable(${TEST_TARGET} test-c.c)
|
||||
|
|
21
tests/get-model.cpp
Normal file
21
tests/get-model.cpp
Normal file
|
@ -0,0 +1,21 @@
|
|||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
|
||||
#include "get-model.h"
|
||||
|
||||
char * get_model_or_exit(int argc, char *argv[]) {
|
||||
char * model_path;
|
||||
if (argc > 1) {
|
||||
model_path = argv[1];
|
||||
|
||||
} else {
|
||||
model_path = getenv("LLAMACPP_TEST_MODELFILE");
|
||||
if (!model_path || strlen(model_path) == 0) {
|
||||
fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
|
||||
exit(EXIT_SUCCESS);
|
||||
}
|
||||
}
|
||||
|
||||
return model_path;
|
||||
}
|
2
tests/get-model.h
Normal file
2
tests/get-model.h
Normal file
|
@ -0,0 +1,2 @@
|
|||
#pragma once
|
||||
char * get_model_or_exit(int, char*[]);
|
24
tests/test-autorelease.cpp
Normal file
24
tests/test-autorelease.cpp
Normal file
|
@ -0,0 +1,24 @@
|
|||
// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
|
||||
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
|
||||
#include "llama.h"
|
||||
#include "get-model.h"
|
||||
|
||||
// This creates a new context inside a pthread and then tries to exit cleanly.
|
||||
int main(int argc, char ** argv) {
|
||||
auto * model_path = get_model_or_exit(argc, argv);
|
||||
|
||||
std::thread([&model_path]() {
|
||||
llama_backend_init(false);
|
||||
auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
|
||||
auto * ctx = llama_new_context_with_model(model, llama_context_default_params());
|
||||
llama_free(ctx);
|
||||
llama_free_model(model);
|
||||
llama_backend_free();
|
||||
}).join();
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -16,39 +16,37 @@
|
|||
#include <vector>
|
||||
|
||||
static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) {
|
||||
// static RNG initialization (revisit if n_threads stops being constant)
|
||||
static const size_t n_threads = std::thread::hardware_concurrency();
|
||||
static std::vector<std::default_random_engine> generators = []() {
|
||||
std::random_device rd;
|
||||
std::vector<std::default_random_engine> vec;
|
||||
vec.reserve(n_threads);
|
||||
//for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed
|
||||
for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); }
|
||||
return vec;
|
||||
}();
|
||||
|
||||
size_t size = ggml_nelements(tensor);
|
||||
std::vector<float> data(size);
|
||||
|
||||
#if 0
|
||||
static std::default_random_engine generator(1234);
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
|
||||
for (size_t i = 0; i < size; i++) {
|
||||
data[i] = distribution(generator);
|
||||
}
|
||||
#else
|
||||
auto init_thread = [&](size_t start, size_t end) {
|
||||
std::random_device rd;
|
||||
std::default_random_engine generator(rd());
|
||||
auto init_thread = [&](size_t ith, size_t start, size_t end) {
|
||||
std::uniform_real_distribution<float> distribution(min, max);
|
||||
|
||||
for (size_t i = start; i < end; i++) {
|
||||
data[i] = distribution(generator);
|
||||
data[i] = distribution(generators[ith]);
|
||||
}
|
||||
};
|
||||
|
||||
size_t n_threads = std::thread::hardware_concurrency();
|
||||
std::vector<std::thread> threads;
|
||||
threads.reserve(n_threads);
|
||||
for (size_t i = 0; i < n_threads; i++) {
|
||||
size_t start = i*size/n_threads;
|
||||
size_t end = (i+1)*size/n_threads;
|
||||
threads.emplace_back(init_thread, start, end);
|
||||
threads.emplace_back(init_thread, i, start, end);
|
||||
}
|
||||
for (auto & t : threads) {
|
||||
t.join();
|
||||
}
|
||||
#endif
|
||||
|
||||
if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
|
||||
ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
|
||||
|
@ -56,7 +54,16 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m
|
|||
GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
|
||||
std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
|
||||
int64_t hist[16];
|
||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, nullptr);
|
||||
std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix
|
||||
const float * im = imatrix.data();
|
||||
if (!ggml_quantize_requires_imatrix(tensor->type)) {
|
||||
// when the imatrix is optional, we want to test both quantization with and without imatrix
|
||||
// use one of the random numbers to decide
|
||||
if (data[0] > 0.5f*(min + max)) {
|
||||
im = nullptr;
|
||||
}
|
||||
}
|
||||
ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], hist, im);
|
||||
ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size());
|
||||
} else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) {
|
||||
// This is going to create some weird integers though.
|
||||
|
@ -95,7 +102,6 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
|
|||
} else if (t->type == GGML_TYPE_I8) {
|
||||
tv.push_back((float)*(int8_t *) &buf[i]);
|
||||
} else if (quantized) {
|
||||
std::vector<float> vq(ggml_blck_size(t->type));
|
||||
tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type));
|
||||
tv.insert(tv.end(), vq.begin(), vq.end());
|
||||
} else {
|
||||
|
@ -233,10 +239,17 @@ static std::string var_to_str(ggml_type type) {
|
|||
#define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
|
||||
#define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
static bool inline _isinf(float f) {
|
||||
return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
|
||||
}
|
||||
#else
|
||||
static bool inline _isinf(float f) { return std::isinf(f); }
|
||||
#endif
|
||||
|
||||
// accept FLT_MAX as infinity
|
||||
static bool isinf_or_max(float f) {
|
||||
return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
||||
return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
|
||||
}
|
||||
|
||||
static bool ggml_is_view_op(enum ggml_op op) {
|
||||
|
@ -1472,7 +1485,8 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
GGML_TYPE_Q8_0,
|
||||
GGML_TYPE_Q2_K, GGML_TYPE_Q3_K,
|
||||
GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
|
||||
GGML_TYPE_Q6_K
|
||||
GGML_TYPE_Q6_K,
|
||||
GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS,
|
||||
};
|
||||
|
||||
// unary ops
|
||||
|
@ -1752,6 +1766,8 @@ int main(int argc, char ** argv) {
|
|||
return 1;
|
||||
}
|
||||
|
||||
ggml_quantize_free();
|
||||
|
||||
printf("\033[1;32mOK\033[0m\n");
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -190,7 +190,6 @@ int main()
|
|||
index++;
|
||||
}
|
||||
|
||||
std::vector<std::vector<const llama_grammar_element *>> next_stacks;
|
||||
std::vector<llama_grammar_candidate> next_candidates;
|
||||
next_candidates.resize(24);
|
||||
|
||||
|
|
27
tests/test-model-load-cancel.cpp
Normal file
27
tests/test-model-load-cancel.cpp
Normal file
|
@ -0,0 +1,27 @@
|
|||
#include "llama.h"
|
||||
#include "get-model.h"
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
int main(int argc, char *argv[] ) {
|
||||
auto * model_path = get_model_or_exit(argc, argv);
|
||||
auto * file = fopen(model_path, "r");
|
||||
if (file == nullptr) {
|
||||
fprintf(stderr, "no model at '%s' found\n", model_path);
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
|
||||
fprintf(stderr, "using '%s'\n", model_path);
|
||||
fclose(file);
|
||||
|
||||
llama_backend_init(false);
|
||||
auto params = llama_model_params{};
|
||||
params.use_mmap = false;
|
||||
params.progress_callback = [](float progress, void * ctx){
|
||||
(void) ctx;
|
||||
return progress > 0.50;
|
||||
};
|
||||
auto * model = llama_load_model_from_file(model_path, params);
|
||||
llama_backend_free();
|
||||
return model == nullptr ? EXIT_SUCCESS : EXIT_FAILURE;
|
||||
}
|
|
@ -5,11 +5,10 @@
|
|||
#undef NDEBUG
|
||||
#endif
|
||||
|
||||
#include <cmath>
|
||||
#include <numeric>
|
||||
#include <cassert>
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
static void dump(const llama_token_data_array * candidates) {
|
||||
for (size_t i = 0; i < candidates->size; i++) {
|
||||
|
@ -20,11 +19,11 @@ static void dump(const llama_token_data_array * candidates) {
|
|||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||
|
||||
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
|
@ -41,11 +40,11 @@ static void test_top_k(const std::vector<float> & probs, const std::vector<float
|
|||
}
|
||||
|
||||
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
|
@ -62,11 +61,11 @@ static void test_top_p(const std::vector<float> & probs, const std::vector<float
|
|||
}
|
||||
|
||||
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
|
@ -81,12 +80,33 @@ static void test_tfs(const std::vector<float> & probs, const std::vector<float>
|
|||
}
|
||||
}
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
size_t n_vocab = probs.size();
|
||||
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_min_p(nullptr, &candidates_p, p, 1);
|
||||
DUMP(&candidates_p);
|
||||
llama_sample_softmax(nullptr, &candidates_p);
|
||||
|
||||
GGML_ASSERT(candidates_p.size == expected_probs.size());
|
||||
for (size_t i = 0; i < candidates_p.size; i++) {
|
||||
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
|
||||
}
|
||||
}
|
||||
|
||||
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
|
@ -107,11 +127,11 @@ static void test_repetition_penalties(
|
|||
) {
|
||||
GGML_ASSERT(probs.size() == expected_probs.size());
|
||||
|
||||
size_t n_vocab = probs.size();
|
||||
const size_t n_vocab = probs.size();
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
float logit = log(probs[token_id]);
|
||||
const float logit = logf(probs[token_id]);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
|
@ -128,6 +148,88 @@ static void test_repetition_penalties(
|
|||
}
|
||||
}
|
||||
|
||||
static void test_sampler_queue(
|
||||
const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
|
||||
) {
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
|
||||
const float logit = logf(token_id);
|
||||
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
|
||||
}
|
||||
|
||||
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
|
||||
|
||||
llama_token min_token_id = 0;
|
||||
const llama_token max_token_id = n_vocab-1;
|
||||
|
||||
for (auto s : samplers_sequence) {
|
||||
switch (s){
|
||||
case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
|
||||
case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
|
||||
case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
|
||||
case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
|
||||
case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
|
||||
case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
|
||||
default : GGML_ASSERT(false && "Unknown sampler"); break;
|
||||
}
|
||||
|
||||
llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests
|
||||
|
||||
const int size = candidates_p.size;
|
||||
|
||||
if (s == 'k') {
|
||||
const int expected_size = std::min(size, top_k);
|
||||
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else if (s == 'p') {
|
||||
const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
|
||||
const int softmax_numerator_target = ceilf(top_p * softmax_divisor);
|
||||
|
||||
min_token_id = n_vocab;
|
||||
int expected_size = 0;
|
||||
int cumsum = 0;
|
||||
do { // do-while because always at least one token is sampled
|
||||
min_token_id--;
|
||||
expected_size++;
|
||||
|
||||
cumsum += min_token_id;
|
||||
} while (cumsum < softmax_numerator_target);
|
||||
|
||||
// token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
|
||||
if (min_token_id == 1) {
|
||||
min_token_id--;
|
||||
expected_size += 1;
|
||||
}
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else if (s == 'm') {
|
||||
int expected_size = ceilf((1.0f-min_p) * n_vocab);
|
||||
expected_size = std::max(expected_size, 1);
|
||||
expected_size = std::min(expected_size, size);
|
||||
|
||||
min_token_id = floorf(min_p * n_vocab);
|
||||
min_token_id = std::max(min_token_id, 1);
|
||||
min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
|
||||
min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));
|
||||
|
||||
GGML_ASSERT(size == expected_size);
|
||||
GGML_ASSERT(candidates_p.data[0].id == max_token_id);
|
||||
GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Sampler queue %3s OK with n_vocab=%05ld top_k=%05d top_p=%f min_p=%f\n",
|
||||
samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
|
||||
}
|
||||
|
||||
int main(void) {
|
||||
ggml_time_init();
|
||||
|
||||
|
@ -139,6 +241,15 @@ int main(void) {
|
|||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
|
||||
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
|
||||
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
|
||||
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
|
||||
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
|
||||
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
|
||||
|
@ -154,6 +265,34 @@ int main(void) {
|
|||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
|
||||
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
|
||||
|
||||
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
|
||||
|
||||
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
|
||||
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
|
||||
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
|
||||
|
||||
test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
|
||||
test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
|
||||
|
||||
test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
|
||||
test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -2,8 +2,9 @@
|
|||
|
||||
#include <cassert>
|
||||
#include <stdexcept>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
|
||||
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue