Merge branch 'master' into spm-infill-support
This commit is contained in:
commit
93fe7b7ea2
405 changed files with 32817 additions and 27710 deletions
|
@ -27,7 +27,7 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable CUDA
|
# Enable CUDA
|
||||||
ENV LLAMA_CUDA=1
|
ENV GGML_CUDA=1
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
# Enable ROCm
|
# Enable ROCm
|
||||||
ENV LLAMA_HIPBLAS=1
|
ENV GGML_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
|
|
@ -21,7 +21,7 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable CUDA
|
# Enable CUDA
|
||||||
ENV LLAMA_CUDA=1
|
ENV GGML_CUDA=1
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-cli
|
RUN make -j$(nproc) llama-cli
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
ARG LLAMA_SYCL_F16=OFF
|
ARG GGML_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y git
|
apt-get install -y git
|
||||||
|
|
||||||
|
@ -10,11 +10,11 @@ WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||||
echo "LLAMA_SYCL_F16 is set" && \
|
echo "GGML_SYCL_F16 is set" && \
|
||||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||||
fi && \
|
fi && \
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
|
||||||
cmake --build build --config Release --target llama-cli
|
cmake --build build --config Release --target llama-cli
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
|
@ -36,7 +36,7 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
# Enable ROCm
|
# Enable ROCm
|
||||||
ENV LLAMA_HIPBLAS=1
|
ENV GGML_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
|
|
|
@ -14,7 +14,7 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key
|
||||||
# Build it
|
# Build it
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN cmake -B build -DLLAMA_VULKAN=1 && \
|
RUN cmake -B build -DGGML_VULKAN=1 && \
|
||||||
cmake --build build --config Release --target llama-cli
|
cmake --build build --config Release --target llama-cli
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
|
|
|
@ -1,84 +0,0 @@
|
||||||
# SRPM for building from source and packaging an RPM for RPM-based distros.
|
|
||||||
# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
|
|
||||||
# Built and maintained by John Boero - boeroboy@gmail.com
|
|
||||||
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal
|
|
||||||
|
|
||||||
# Notes for llama.cpp:
|
|
||||||
# 1. Tags are currently based on hash - which will not sort asciibetically.
|
|
||||||
# We need to declare standard versioning if people want to sort latest releases.
|
|
||||||
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
|
|
||||||
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
|
|
||||||
# Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
|
|
||||||
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
|
|
||||||
# It is up to the user to install the correct vendor-specific support.
|
|
||||||
|
|
||||||
Name: llama.cpp-clblast
|
|
||||||
Version: %( date "+%%Y%%m%%d" )
|
|
||||||
Release: 1%{?dist}
|
|
||||||
Summary: OpenCL Inference of LLaMA model in C/C++
|
|
||||||
License: MIT
|
|
||||||
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
|
|
||||||
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
|
|
||||||
Requires: clblast
|
|
||||||
URL: https://github.com/ggerganov/llama.cpp
|
|
||||||
|
|
||||||
%define debug_package %{nil}
|
|
||||||
%define source_date_epoch_from_changelog 0
|
|
||||||
|
|
||||||
%description
|
|
||||||
CPU inference for Meta's Lllama2 models using default options.
|
|
||||||
|
|
||||||
%prep
|
|
||||||
%setup -n llama.cpp-master
|
|
||||||
|
|
||||||
%build
|
|
||||||
make -j LLAMA_CLBLAST=1
|
|
||||||
|
|
||||||
%install
|
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
|
||||||
cp -p llama-cli %{buildroot}%{_bindir}/llama-clblast-cli
|
|
||||||
cp -p llama-server %{buildroot}%{_bindir}/llama-clblast-server
|
|
||||||
cp -p llama-simple %{buildroot}%{_bindir}/llama-clblast-simple
|
|
||||||
|
|
||||||
mkdir -p %{buildroot}/usr/lib/systemd/system
|
|
||||||
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
|
|
||||||
[Unit]
|
|
||||||
Description=Llama.cpp server, CPU only (no GPU support in this build).
|
|
||||||
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
Type=simple
|
|
||||||
EnvironmentFile=/etc/sysconfig/llama
|
|
||||||
ExecStart=/usr/bin/llama-clblast-server $LLAMA_ARGS
|
|
||||||
ExecReload=/bin/kill -s HUP $MAINPID
|
|
||||||
Restart=never
|
|
||||||
|
|
||||||
[Install]
|
|
||||||
WantedBy=default.target
|
|
||||||
EOF
|
|
||||||
|
|
||||||
mkdir -p %{buildroot}/etc/sysconfig
|
|
||||||
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
|
|
||||||
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
|
|
||||||
EOF
|
|
||||||
|
|
||||||
%clean
|
|
||||||
rm -rf %{buildroot}
|
|
||||||
rm -rf %{_builddir}/*
|
|
||||||
|
|
||||||
%files
|
|
||||||
%{_bindir}/llama-clblast-cli
|
|
||||||
%{_bindir}/llama-clblast-server
|
|
||||||
%{_bindir}/llama-clblast-simple
|
|
||||||
/usr/lib/systemd/system/llamaclblast.service
|
|
||||||
%config /etc/sysconfig/llama
|
|
||||||
|
|
||||||
|
|
||||||
%pre
|
|
||||||
|
|
||||||
%post
|
|
||||||
|
|
||||||
%preun
|
|
||||||
%postun
|
|
||||||
|
|
||||||
%changelog
|
|
|
@ -32,7 +32,7 @@ CPU inference for Meta's Lllama2 models using default options.
|
||||||
%setup -n llama.cpp-master
|
%setup -n llama.cpp-master
|
||||||
|
|
||||||
%build
|
%build
|
||||||
make -j LLAMA_CUDA=1
|
make -j GGML_CUDA=1
|
||||||
|
|
||||||
%install
|
%install
|
||||||
mkdir -p %{buildroot}%{_bindir}/
|
mkdir -p %{buildroot}%{_bindir}/
|
||||||
|
|
|
@ -21,7 +21,7 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
|
||||||
# Enable CUDA
|
# Enable CUDA
|
||||||
ENV LLAMA_CUDA=1
|
ENV GGML_CUDA=1
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
|
|
||||||
|
@ -30,8 +30,10 @@ RUN make -j$(nproc) llama-server
|
||||||
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
FROM ${BASE_CUDA_RUN_CONTAINER} as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev libgomp1
|
apt-get install -y libcurl4-openssl-dev libgomp1 curl
|
||||||
|
|
||||||
COPY --from=build /app/llama-server /llama-server
|
COPY --from=build /app/llama-server /llama-server
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
||||||
|
|
|
@ -2,7 +2,7 @@ ARG ONEAPI_VERSION=2024.1.1-devel-ubuntu22.04
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
|
||||||
|
|
||||||
ARG LLAMA_SYCL_F16=OFF
|
ARG GGML_SYCL_F16=OFF
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y git libcurl4-openssl-dev
|
apt-get install -y git libcurl4-openssl-dev
|
||||||
|
|
||||||
|
@ -10,20 +10,22 @@ WORKDIR /app
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
|
RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
|
||||||
echo "LLAMA_SYCL_F16 is set" && \
|
echo "GGML_SYCL_F16 is set" && \
|
||||||
export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
|
export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
|
||||||
fi && \
|
fi && \
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
|
||||||
cmake --build build --config Release --target llama-server
|
cmake --build build --config Release --target llama-server
|
||||||
|
|
||||||
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev
|
apt-get install -y libcurl4-openssl-dev curl
|
||||||
|
|
||||||
COPY --from=build /app/build/bin/llama-server /llama-server
|
COPY --from=build /app/build/bin/llama-server /llama-server
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
||||||
|
|
|
@ -36,15 +36,17 @@ COPY . .
|
||||||
# Set nvcc architecture
|
# Set nvcc architecture
|
||||||
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
|
||||||
# Enable ROCm
|
# Enable ROCm
|
||||||
ENV LLAMA_HIPBLAS=1
|
ENV GGML_HIPBLAS=1
|
||||||
ENV CC=/opt/rocm/llvm/bin/clang
|
ENV CC=/opt/rocm/llvm/bin/clang
|
||||||
ENV CXX=/opt/rocm/llvm/bin/clang++
|
ENV CXX=/opt/rocm/llvm/bin/clang++
|
||||||
|
|
||||||
# Enable cURL
|
# Enable cURL
|
||||||
ENV LLAMA_CURL=1
|
ENV LLAMA_CURL=1
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y libcurl4-openssl-dev
|
apt-get install -y libcurl4-openssl-dev curl
|
||||||
|
|
||||||
RUN make -j$(nproc) llama-server
|
RUN make -j$(nproc) llama-server
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
ENTRYPOINT [ "/app/llama-server" ]
|
ENTRYPOINT [ "/app/llama-server" ]
|
||||||
|
|
|
@ -5,20 +5,16 @@ FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
# Install build tools
|
# Install build tools
|
||||||
RUN apt update && apt install -y git build-essential cmake wget
|
RUN apt update && apt install -y git build-essential cmake wget
|
||||||
|
|
||||||
# Install Vulkan SDK
|
# Install Vulkan SDK and cURL
|
||||||
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
|
||||||
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
|
||||||
apt update -y && \
|
apt update -y && \
|
||||||
apt-get install -y vulkan-sdk
|
apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
|
||||||
|
|
||||||
# Install cURL
|
|
||||||
RUN apt-get update && \
|
|
||||||
apt-get install -y libcurl4-openssl-dev
|
|
||||||
|
|
||||||
# Build it
|
# Build it
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
COPY . .
|
COPY . .
|
||||||
RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \
|
RUN cmake -B build -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
|
||||||
cmake --build build --config Release --target llama-server
|
cmake --build build --config Release --target llama-server
|
||||||
|
|
||||||
# Clean up
|
# Clean up
|
||||||
|
@ -28,4 +24,6 @@ RUN cp /app/build/bin/llama-server /llama-server && \
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
||||||
|
|
|
@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
|
||||||
FROM ubuntu:$UBUNTU_VERSION as build
|
FROM ubuntu:$UBUNTU_VERSION as build
|
||||||
|
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
apt-get install -y build-essential git libcurl4-openssl-dev
|
apt-get install -y build-essential git libcurl4-openssl-dev curl
|
||||||
|
|
||||||
WORKDIR /app
|
WORKDIR /app
|
||||||
|
|
||||||
|
@ -22,4 +22,6 @@ COPY --from=build /app/llama-server /llama-server
|
||||||
|
|
||||||
ENV LC_ALL=C.utf8
|
ENV LC_ALL=C.utf8
|
||||||
|
|
||||||
|
HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]
|
||||||
|
|
||||||
ENTRYPOINT [ "/llama-server" ]
|
ENTRYPOINT [ "/llama-server" ]
|
||||||
|
|
|
@ -160,9 +160,9 @@ effectiveStdenv.mkDerivation (
|
||||||
};
|
};
|
||||||
|
|
||||||
postPatch = ''
|
postPatch = ''
|
||||||
substituteInPlace ./ggml-metal.m \
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
--replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
|
||||||
substituteInPlace ./ggml-metal.m \
|
substituteInPlace ./ggml/src/ggml-metal.m \
|
||||||
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
--replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
|
||||||
'';
|
'';
|
||||||
|
|
||||||
|
@ -205,17 +205,17 @@ effectiveStdenv.mkDerivation (
|
||||||
|
|
||||||
cmakeFlags =
|
cmakeFlags =
|
||||||
[
|
[
|
||||||
(cmakeBool "LLAMA_NATIVE" false)
|
|
||||||
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
(cmakeBool "LLAMA_BUILD_SERVER" true)
|
||||||
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
(cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
|
||||||
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
(cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
|
||||||
(cmakeBool "LLAMA_BLAS" useBlas)
|
(cmakeBool "GGML_NATIVE" false)
|
||||||
(cmakeBool "LLAMA_CLBLAST" useOpenCL)
|
(cmakeBool "GGML_BLAS" useBlas)
|
||||||
(cmakeBool "LLAMA_CUDA" useCuda)
|
(cmakeBool "GGML_CLBLAST" useOpenCL)
|
||||||
(cmakeBool "LLAMA_HIPBLAS" useRocm)
|
(cmakeBool "GGML_CUDA" useCuda)
|
||||||
(cmakeBool "LLAMA_METAL" useMetalKit)
|
(cmakeBool "GGML_HIPBLAS" useRocm)
|
||||||
(cmakeBool "LLAMA_VULKAN" useVulkan)
|
(cmakeBool "GGML_METAL" useMetalKit)
|
||||||
(cmakeBool "LLAMA_STATIC" enableStatic)
|
(cmakeBool "GGML_VULKAN" useVulkan)
|
||||||
|
(cmakeBool "GGML_STATIC" enableStatic)
|
||||||
]
|
]
|
||||||
++ optionals useCuda [
|
++ optionals useCuda [
|
||||||
(
|
(
|
||||||
|
@ -231,7 +231,7 @@ effectiveStdenv.mkDerivation (
|
||||||
]
|
]
|
||||||
++ optionals useMetalKit [
|
++ optionals useMetalKit [
|
||||||
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
(lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
|
||||||
(cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
(cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
|
||||||
];
|
];
|
||||||
|
|
||||||
# Environment variables needed for ROCm
|
# Environment variables needed for ROCm
|
||||||
|
@ -244,7 +244,7 @@ effectiveStdenv.mkDerivation (
|
||||||
# if they haven't been added yet.
|
# if they haven't been added yet.
|
||||||
postInstall = ''
|
postInstall = ''
|
||||||
mkdir -p $out/include
|
mkdir -p $out/include
|
||||||
cp $src/llama.h $out/include/
|
cp $src/include/llama.h $out/include/
|
||||||
'';
|
'';
|
||||||
|
|
||||||
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
# Define the shells here, but don't add in the inputsFrom to avoid recursion.
|
||||||
|
|
|
@ -28,4 +28,5 @@ indent_size = 2
|
||||||
indent_style = tab
|
indent_style = tab
|
||||||
|
|
||||||
[examples/cvector-generator/*.txt]
|
[examples/cvector-generator/*.txt]
|
||||||
|
trim_trailing_whitespace = unset
|
||||||
insert_final_newline = unset
|
insert_final_newline = unset
|
||||||
|
|
28
.github/labeler.yml
vendored
28
.github/labeler.yml
vendored
|
@ -2,31 +2,31 @@
|
||||||
Kompute:
|
Kompute:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-kompute.h
|
- ggml/include/ggml-kompute.h
|
||||||
- ggml-kompute.cpp
|
- ggml/src/ggml-kompute.cpp
|
||||||
- README-kompute.md
|
- README-kompute.md
|
||||||
Apple Metal:
|
Apple Metal:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-metal.h
|
- ggml/include/ggml-metal.h
|
||||||
- ggml-metal.cpp
|
- ggml/src/ggml-metal.cpp
|
||||||
- README-metal.md
|
- README-metal.md
|
||||||
SYCL:
|
SYCL:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-sycl.h
|
- ggml/include/ggml-sycl.h
|
||||||
- ggml-sycl.cpp
|
- ggml/src/ggml-sycl.cpp
|
||||||
- README-sycl.md
|
- README-sycl.md
|
||||||
Nvidia GPU:
|
Nvidia GPU:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml-cuda.h
|
- ggml/include/ggml-cuda.h
|
||||||
- ggml-cuda/**
|
- ggml/src/ggml-cuda/**
|
||||||
Vulkan:
|
Vulkan:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml_vk_generate_shaders.py
|
- ggml/ggml_vk_generate_shaders.py
|
||||||
- ggml-vulkan*
|
- ggml/src/ggml-vulkan*
|
||||||
documentation:
|
documentation:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
|
@ -73,10 +73,10 @@ server:
|
||||||
ggml:
|
ggml:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
- any-glob-to-any-file:
|
- any-glob-to-any-file:
|
||||||
- ggml.c
|
- ggml/include/ggml*.h
|
||||||
- ggml.h
|
- ggml/src/ggml*.c
|
||||||
- ggml-*.c
|
- ggml/src/ggml*.cpp
|
||||||
- ggml-*.h
|
- ggml/src/ggml*.h
|
||||||
- ggml-cuda/**
|
- ggml-cuda/**
|
||||||
nix:
|
nix:
|
||||||
- changed-files:
|
- changed-files:
|
||||||
|
|
2
.github/workflows/bench.yml
vendored
2
.github/workflows/bench.yml
vendored
|
@ -109,7 +109,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
set -eux
|
set -eux
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DLLAMA_CUBLAS=ON \
|
-DLLAMA_CUBLAS=ON \
|
||||||
|
|
78
.github/workflows/build.yml
vendored
78
.github/workflows/build.yml
vendored
|
@ -10,10 +10,10 @@ on:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m']
|
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||||
pull_request:
|
pull_request:
|
||||||
types: [opened, synchronize, reopened]
|
types: [opened, synchronize, reopened]
|
||||||
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m']
|
paths: ['.github/workflows/build.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
@ -47,7 +47,7 @@ jobs:
|
||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -105,7 +105,7 @@ jobs:
|
||||||
sysctl -a
|
sysctl -a
|
||||||
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
# Metal is disabled due to intermittent failures with Github runners not having a GPU:
|
||||||
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
# https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
|
||||||
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON
|
cmake -B build -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF -DLLAMA_CURL=ON
|
||||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -305,7 +305,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DLLAMA_OPENMP=OFF
|
cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
|
||||||
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
cmake --build . --config ${{ matrix.build_type }} -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -335,7 +335,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_RPC=ON ..
|
cmake -DGGML_RPC=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -363,7 +363,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_VULKAN=ON ..
|
cmake -DGGML_VULKAN=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-hip:
|
ubuntu-22-cmake-hip:
|
||||||
|
@ -384,13 +384,13 @@ jobs:
|
||||||
- name: Build with native CMake HIP support
|
- name: Build with native CMake HIP support
|
||||||
id: cmake_build
|
id: cmake_build
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DLLAMA_HIPBLAS=ON
|
cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIPBLAS=ON
|
||||||
cmake --build build --config Release -j $(nproc)
|
cmake --build build --config Release -j $(nproc)
|
||||||
|
|
||||||
- name: Build with legacy HIP support
|
- name: Build with legacy HIP support
|
||||||
id: cmake_build_legacy_hip
|
id: cmake_build_legacy_hip
|
||||||
run: |
|
run: |
|
||||||
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DLLAMA_HIPBLAS=ON
|
cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIPBLAS=ON
|
||||||
cmake --build build2 --config Release -j $(nproc)
|
cmake --build build2 --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-sycl:
|
ubuntu-22-cmake-sycl:
|
||||||
|
@ -431,7 +431,7 @@ jobs:
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
ubuntu-22-cmake-sycl-fp16:
|
ubuntu-22-cmake-sycl-fp16:
|
||||||
|
@ -472,10 +472,10 @@ jobs:
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON ..
|
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
|
||||||
cmake --build . --config Release -j $(nproc)
|
cmake --build . --config Release -j $(nproc)
|
||||||
|
|
||||||
# TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with GGML_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
|
||||||
macOS-latest-make:
|
macOS-latest-make:
|
||||||
|
@ -497,15 +497,15 @@ jobs:
|
||||||
env:
|
env:
|
||||||
LLAMA_FATAL_WARNINGS: 1
|
LLAMA_FATAL_WARNINGS: 1
|
||||||
run: |
|
run: |
|
||||||
LLAMA_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
id: make_test
|
id: make_test
|
||||||
run: |
|
run: |
|
||||||
LLAMA_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make tests -j $(sysctl -n hw.logicalcpu)
|
||||||
LLAMA_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
|
GGML_NO_METAL=1 make test -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
# TODO: build with LLAMA_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
# TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
|
||||||
# how to debug it.
|
# how to debug it.
|
||||||
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
|
# ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
|
||||||
# would be great if we fix these
|
# would be great if we fix these
|
||||||
|
@ -529,7 +529,7 @@ jobs:
|
||||||
sysctl -a
|
sysctl -a
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF ..
|
cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
||||||
|
|
||||||
- name: Test
|
- name: Test
|
||||||
|
@ -559,13 +559,14 @@ jobs:
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -G Xcode .. \
|
cmake -G Xcode .. \
|
||||||
-DLLAMA_METAL_EMBED_LIBRARY=ON \
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
-DLLAMA_BUILD_TESTS=OFF \
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=OFF \
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
-DCMAKE_SYSTEM_NAME=iOS \
|
-DCMAKE_SYSTEM_NAME=iOS \
|
||||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
macOS-latest-cmake-tvos:
|
macOS-latest-cmake-tvos:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
@ -588,13 +589,14 @@ jobs:
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -G Xcode .. \
|
cmake -G Xcode .. \
|
||||||
-DLLAMA_METAL_EMBED_LIBRARY=ON \
|
-DGGML_METAL_EMBED_LIBRARY=ON \
|
||||||
-DLLAMA_BUILD_EXAMPLES=OFF \
|
-DLLAMA_BUILD_EXAMPLES=OFF \
|
||||||
-DLLAMA_BUILD_TESTS=OFF \
|
-DLLAMA_BUILD_TESTS=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=OFF \
|
-DLLAMA_BUILD_SERVER=OFF \
|
||||||
-DCMAKE_SYSTEM_NAME=tvOS \
|
-DCMAKE_SYSTEM_NAME=tvOS \
|
||||||
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0
|
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
|
||||||
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
|
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
|
||||||
|
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
|
||||||
|
|
||||||
macOS-latest-swift:
|
macOS-latest-swift:
|
||||||
runs-on: macos-latest
|
runs-on: macos-latest
|
||||||
|
@ -662,7 +664,7 @@ jobs:
|
||||||
- name: Build using make w/ OpenBLAS
|
- name: Build using make w/ OpenBLAS
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
run: |
|
run: |
|
||||||
make LLAMA_OPENBLAS=1 -j $(nproc)
|
make GGML_OPENBLAS=1 -j $(nproc)
|
||||||
|
|
||||||
- name: Build using CMake
|
- name: Build using CMake
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
|
@ -678,7 +680,7 @@ jobs:
|
||||||
- name: Build using CMake w/ OpenBLAS
|
- name: Build using CMake w/ OpenBLAS
|
||||||
shell: msys2 {0}
|
shell: msys2 {0}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||||
|
|
||||||
windows-latest-cmake:
|
windows-latest-cmake:
|
||||||
|
@ -693,25 +695,25 @@ jobs:
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
- build: 'rpc-x64'
|
- build: 'rpc-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'noavx-x64'
|
- build: 'noavx-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx2-x64'
|
- build: 'avx2-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx-x64'
|
- build: 'avx-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'avx512-x64'
|
- build: 'avx512-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_AVX512=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'openblas-x64'
|
- build: 'openblas-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_BLAS=ON -DBUILD_SHARED_LIBS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
|
||||||
- build: 'kompute-x64'
|
- build: 'kompute-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'vulkan-x64'
|
- build: 'vulkan-x64'
|
||||||
defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'llvm-arm64'
|
- build: 'llvm-arm64'
|
||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
- build: 'msvc-arm64'
|
- build: 'msvc-arm64'
|
||||||
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Clone
|
- name: Clone
|
||||||
|
@ -724,7 +726,7 @@ jobs:
|
||||||
id: clone_kompute
|
id: clone_kompute
|
||||||
if: ${{ matrix.build == 'kompute-x64' }}
|
if: ${{ matrix.build == 'kompute-x64' }}
|
||||||
run: |
|
run: |
|
||||||
git submodule update --init kompute
|
git submodule update --init ggml/src/kompute
|
||||||
|
|
||||||
- name: Download OpenBLAS
|
- name: Download OpenBLAS
|
||||||
id: get_openblas
|
id: get_openblas
|
||||||
|
@ -854,7 +856,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
|
||||||
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
|
||||||
|
|
||||||
- name: Determine tag name
|
- name: Determine tag name
|
||||||
|
@ -987,7 +989,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
$env:HIP_PATH=$(Resolve-Path 'C:\Program Files\AMD\ROCm\*\bin\clang.exe' | split-path | split-path)
|
||||||
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
$env:CMAKE_PREFIX_PATH="${env:HIP_PATH}"
|
||||||
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DLLAMA_HIPBLAS=ON
|
cmake -G "Unix Makefiles" -B build -S . -DCMAKE_C_COMPILER="${env:HIP_PATH}\bin\clang.exe" -DCMAKE_CXX_COMPILER="${env:HIP_PATH}\bin\clang++.exe" -DGGML_HIPBLAS=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
|
|
||||||
ios-xcode-build:
|
ios-xcode-build:
|
||||||
|
|
11
.github/workflows/docker.yml
vendored
11
.github/workflows/docker.yml
vendored
|
@ -10,10 +10,11 @@
|
||||||
name: Publish Docker image
|
name: Publish Docker image
|
||||||
|
|
||||||
on:
|
on:
|
||||||
pull_request:
|
#pull_request:
|
||||||
push:
|
push:
|
||||||
branches:
|
branches:
|
||||||
- master
|
- master
|
||||||
|
paths: ['.github/workflows/docker.yml', '.devops/*.Dockerfile', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.cuh', '**/*.swift', '**/*.m', '**/*.metal']
|
||||||
|
|
||||||
concurrency:
|
concurrency:
|
||||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||||
|
@ -22,7 +23,7 @@ concurrency:
|
||||||
jobs:
|
jobs:
|
||||||
push_to_registry:
|
push_to_registry:
|
||||||
name: Push Docker image to Docker Hub
|
name: Push Docker image to Docker Hub
|
||||||
if: github.event.pull_request.draft == false
|
#if: github.event.pull_request.draft == false
|
||||||
|
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
env:
|
env:
|
||||||
|
@ -33,15 +34,13 @@ jobs:
|
||||||
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
|
|
||||||
# have disabled them for now until the reason why
|
|
||||||
# is understood.
|
|
||||||
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
# Note: the full-rocm image is failing due to a "no space left on device" error. It is disabled for now to allow the workflow to complete.
|
||||||
|
#- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
|
||||||
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
- { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" }
|
||||||
steps:
|
steps:
|
||||||
|
|
8
.github/workflows/server.yml
vendored
8
.github/workflows/server.yml
vendored
|
@ -30,7 +30,7 @@ jobs:
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||||
build_type: [RelWithDebInfo]
|
build_type: [RelWithDebInfo]
|
||||||
include:
|
include:
|
||||||
- build_type: Release
|
- build_type: Release
|
||||||
|
@ -92,12 +92,12 @@ jobs:
|
||||||
if: ${{ matrix.sanitizer == 'THREAD' }}
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
-DLLAMA_OPENMP=OFF ;
|
-DGGML_OPENMP=OFF ;
|
||||||
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
- name: Build
|
- name: Build
|
||||||
|
@ -105,7 +105,7 @@ jobs:
|
||||||
if: ${{ matrix.sanitizer != 'THREAD' }}
|
if: ${{ matrix.sanitizer != 'THREAD' }}
|
||||||
run: |
|
run: |
|
||||||
cmake -B build \
|
cmake -B build \
|
||||||
-DLLAMA_NATIVE=OFF \
|
-DGGML_NATIVE=OFF \
|
||||||
-DLLAMA_BUILD_SERVER=ON \
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
-DLLAMA_CURL=ON \
|
-DLLAMA_CURL=ON \
|
||||||
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
|
114
.gitignore
vendored
114
.gitignore
vendored
|
@ -1,90 +1,124 @@
|
||||||
*.o
|
# Extensions
|
||||||
|
|
||||||
*.a
|
*.a
|
||||||
*.so
|
*.bat
|
||||||
|
*.bin
|
||||||
|
*.dll
|
||||||
|
*.dot
|
||||||
|
*.etag
|
||||||
|
*.exe
|
||||||
|
*.gcda
|
||||||
|
*.gcno
|
||||||
|
*.gcov
|
||||||
*.gguf
|
*.gguf
|
||||||
*.gguf.json
|
*.gguf.json
|
||||||
*.bin
|
|
||||||
*.exe
|
|
||||||
*.dll
|
|
||||||
*.log
|
|
||||||
*.gcov
|
|
||||||
*.gcno
|
|
||||||
*.gcda
|
|
||||||
*.dot
|
|
||||||
*.bat
|
|
||||||
*.tmp
|
|
||||||
*.metallib
|
|
||||||
*.etag
|
|
||||||
*.lastModified
|
*.lastModified
|
||||||
.DS_Store
|
*.log
|
||||||
.build/
|
*.metallib
|
||||||
|
*.o
|
||||||
|
*.so
|
||||||
|
*.tmp
|
||||||
|
|
||||||
|
# IDE / OS
|
||||||
|
|
||||||
.cache/
|
.cache/
|
||||||
.ccls-cache/
|
.ccls-cache/
|
||||||
.direnv/
|
.direnv/
|
||||||
|
.DS_Store
|
||||||
.envrc
|
.envrc
|
||||||
|
.idea/
|
||||||
.swiftpm
|
.swiftpm
|
||||||
.venv
|
|
||||||
.clang-tidy
|
|
||||||
.vs/
|
.vs/
|
||||||
.vscode/
|
.vscode/
|
||||||
.idea/
|
nppBackup
|
||||||
|
|
||||||
ggml-metal-embed.metal
|
|
||||||
|
|
||||||
lcov-report/
|
# Coverage
|
||||||
|
|
||||||
gcovr-report/
|
gcovr-report/
|
||||||
|
lcov-report/
|
||||||
|
|
||||||
|
# Build Artifacts
|
||||||
|
|
||||||
tags
|
tags
|
||||||
|
.build/
|
||||||
build*
|
build*
|
||||||
|
!build-info.cmake
|
||||||
|
!build-info.cpp.in
|
||||||
|
!build-info.sh
|
||||||
!build.zig
|
!build.zig
|
||||||
cmake-build-*
|
/libllama.so
|
||||||
|
/llama-*
|
||||||
android-ndk-*
|
android-ndk-*
|
||||||
|
arm_neon.h
|
||||||
|
cmake-build-*
|
||||||
|
CMakeSettings.json
|
||||||
|
compile_commands.json
|
||||||
|
ggml-metal-embed.metal
|
||||||
|
llama-batched-swift
|
||||||
|
/rpc-server
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
|
||||||
|
# CI
|
||||||
|
|
||||||
|
!.github/workflows/*.yml
|
||||||
|
|
||||||
|
# Models
|
||||||
|
|
||||||
models/*
|
models/*
|
||||||
models-mnt
|
models-mnt
|
||||||
|
!models/.editorconfig
|
||||||
|
!models/ggml-vocab-*.gguf*
|
||||||
|
|
||||||
/Pipfile
|
# Zig
|
||||||
/libllama.so
|
|
||||||
/llama-*
|
|
||||||
llama-batched-swift
|
|
||||||
/common/build-info.cpp
|
|
||||||
arm_neon.h
|
|
||||||
compile_commands.json
|
|
||||||
CMakeSettings.json
|
|
||||||
|
|
||||||
__pycache__
|
|
||||||
dist
|
|
||||||
|
|
||||||
zig-out/
|
zig-out/
|
||||||
zig-cache/
|
zig-cache/
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
|
||||||
ppl-*.txt
|
ppl-*.txt
|
||||||
qnt-*.txt
|
qnt-*.txt
|
||||||
perf-*.txt
|
perf-*.txt
|
||||||
|
|
||||||
|
# Examples
|
||||||
|
|
||||||
examples/jeopardy/results.txt
|
examples/jeopardy/results.txt
|
||||||
|
examples/server/*.css.hpp
|
||||||
examples/server/*.html.hpp
|
examples/server/*.html.hpp
|
||||||
examples/server/*.js.hpp
|
examples/server/*.js.hpp
|
||||||
examples/server/*.mjs.hpp
|
examples/server/*.mjs.hpp
|
||||||
examples/server/*.css.hpp
|
!build_64.sh
|
||||||
|
!examples/*.bat
|
||||||
|
!examples/*/*.kts
|
||||||
|
!examples/*/*/*.kts
|
||||||
|
!examples/sycl/*.bat
|
||||||
|
!examples/sycl/*.sh
|
||||||
|
|
||||||
|
# Python
|
||||||
|
|
||||||
|
__pycache__
|
||||||
|
.venv
|
||||||
|
/Pipfile
|
||||||
|
dist
|
||||||
poetry.lock
|
poetry.lock
|
||||||
poetry.toml
|
poetry.toml
|
||||||
nppBackup
|
|
||||||
|
|
||||||
# Test binaries
|
# Test binaries
|
||||||
/tests/test-grammar-parser
|
/tests/test-backend-ops
|
||||||
/tests/test-llama-grammar
|
|
||||||
/tests/test-double-float
|
/tests/test-double-float
|
||||||
/tests/test-grad0
|
/tests/test-grad0
|
||||||
|
/tests/test-grammar-parser
|
||||||
|
/tests/test-llama-grammar
|
||||||
/tests/test-opt
|
/tests/test-opt
|
||||||
/tests/test-quantize-fns
|
/tests/test-quantize-fns
|
||||||
/tests/test-quantize-perf
|
/tests/test-quantize-perf
|
||||||
|
/tests/test-rope
|
||||||
/tests/test-sampling
|
/tests/test-sampling
|
||||||
/tests/test-tokenizer-0
|
/tests/test-tokenizer-0
|
||||||
/tests/test-tokenizer-1-spm
|
|
||||||
/tests/test-tokenizer-1-bpe
|
/tests/test-tokenizer-1-bpe
|
||||||
/tests/test-rope
|
/tests/test-tokenizer-1-spm
|
||||||
/tests/test-backend-ops
|
|
||||||
|
# Scripts
|
||||||
|
!/scripts/install-oneapi.bat
|
||||||
|
|
2
.gitmodules
vendored
2
.gitmodules
vendored
|
@ -1,3 +1,3 @@
|
||||||
[submodule "kompute"]
|
[submodule "kompute"]
|
||||||
path = kompute
|
path = ggml/src/kompute
|
||||||
url = https://github.com/nomic-ai/kompute.git
|
url = https://github.com/nomic-ai/kompute.git
|
||||||
|
|
129
AUTHORS
129
AUTHORS
|
@ -1,8 +1,9 @@
|
||||||
# date: Tue Apr 9 09:17:14 EEST 2024
|
# date: Wed Jun 26 19:36:34 EEST 2024
|
||||||
# this file is auto-generated by scripts/gen-authors.sh
|
# this file is auto-generated by scripts/gen-authors.sh
|
||||||
|
|
||||||
0cc4m <picard12@live.de>
|
0cc4m <picard12@live.de>
|
||||||
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
0xspringtime <110655352+0xspringtime@users.noreply.github.com>
|
||||||
|
20kdc <asdd2808@gmail.com>
|
||||||
2f38b454 <dxf@protonmail.com>
|
2f38b454 <dxf@protonmail.com>
|
||||||
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com>
|
||||||
44670 <44670@users.noreply.github.com>
|
44670 <44670@users.noreply.github.com>
|
||||||
|
@ -11,14 +12,18 @@ AT <manyoso@users.noreply.github.com>
|
||||||
Aarni Koskela <akx@iki.fi>
|
Aarni Koskela <akx@iki.fi>
|
||||||
Aaron Miller <apage43@ninjawhale.com>
|
Aaron Miller <apage43@ninjawhale.com>
|
||||||
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
Aaryaman Vasishta <aaryaman.vasishta@amd.com>
|
||||||
|
Abheek Gulati <abheekg@hotmail.com>
|
||||||
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
|
||||||
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com>
|
||||||
Adithya Balaji <adithya.b94@gmail.com>
|
Adithya Balaji <adithya.b94@gmail.com>
|
||||||
AdithyanI <adithyan.i4internet@gmail.com>
|
AdithyanI <adithyan.i4internet@gmail.com>
|
||||||
Adrian <smith.adriane@gmail.com>
|
Adrian <smith.adriane@gmail.com>
|
||||||
Adrian Hesketh <a-h@users.noreply.github.com>
|
Adrian Hesketh <a-h@users.noreply.github.com>
|
||||||
|
Ahmet Zeer <ahmed.zeer@std.yildiz.edu.tr>
|
||||||
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
|
||||||
Aisuko <urakiny@gmail.com>
|
Aisuko <urakiny@gmail.com>
|
||||||
|
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
|
||||||
|
Albert Jin <albert.jin@gmail.com>
|
||||||
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
Alberto <57916483+albbus-stack@users.noreply.github.com>
|
||||||
Alex <awhill19@icloud.com>
|
Alex <awhill19@icloud.com>
|
||||||
Alex Azarov <alex@azarov.by>
|
Alex Azarov <alex@azarov.by>
|
||||||
|
@ -35,19 +40,24 @@ Ali Nehzat <ali.nehzat@thanks.dev>
|
||||||
Ali Tariq <ali.tariq@10xengineers.ai>
|
Ali Tariq <ali.tariq@10xengineers.ai>
|
||||||
Alon <alonfaraj@gmail.com>
|
Alon <alonfaraj@gmail.com>
|
||||||
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
AlpinDale <52078762+AlpinDale@users.noreply.github.com>
|
||||||
|
Amir <amir_zia@outlook.com>
|
||||||
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
|
||||||
Ananta Bastola <anantarajbastola@gmail.com>
|
Ananta Bastola <anantarajbastola@gmail.com>
|
||||||
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
Anas Ahouzi <112881240+aahouzi@users.noreply.github.com>
|
||||||
András Salamon <ott2@users.noreply.github.com>
|
András Salamon <ott2@users.noreply.github.com>
|
||||||
Andrei <abetlen@gmail.com>
|
Andrei <abetlen@gmail.com>
|
||||||
Andrew Canis <andrew.canis@gmail.com>
|
Andrew Canis <andrew.canis@gmail.com>
|
||||||
|
Andrew Downing <andrew2085@gmail.com>
|
||||||
Andrew Duffy <a10y@users.noreply.github.com>
|
Andrew Duffy <a10y@users.noreply.github.com>
|
||||||
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
Andrew Godfrey <AndrewGodfrey@users.noreply.github.com>
|
||||||
|
Andy Tai <andy-tai@users.noreply.github.com>
|
||||||
Arik Poznanski <arikpoz@users.noreply.github.com>
|
Arik Poznanski <arikpoz@users.noreply.github.com>
|
||||||
Artem <guinmoon@gmail.com>
|
Artem <guinmoon@gmail.com>
|
||||||
|
Artem Zinnatullin <ceo@abstractny.gay>
|
||||||
Artyom Lebedev <vagran.ast@gmail.com>
|
Artyom Lebedev <vagran.ast@gmail.com>
|
||||||
Asbjørn Olling <asbjornolling@gmail.com>
|
Asbjørn Olling <asbjornolling@gmail.com>
|
||||||
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
Ásgeir Bjarni Ingvarsson <asgeir@fundinn.org>
|
||||||
|
Ashish <1856117+ashishdatta@users.noreply.github.com>
|
||||||
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
Ashok Gelal <401055+ashokgelal@users.noreply.github.com>
|
||||||
Ashraful Islam <ashraful.meche@gmail.com>
|
Ashraful Islam <ashraful.meche@gmail.com>
|
||||||
Atsushi Tatsuma <yoshoku@outlook.com>
|
Atsushi Tatsuma <yoshoku@outlook.com>
|
||||||
|
@ -57,35 +67,46 @@ BADR <contact@pythops.com>
|
||||||
Bach Le <bach@bullno1.com>
|
Bach Le <bach@bullno1.com>
|
||||||
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
Bailey Chittle <39804642+bachittle@users.noreply.github.com>
|
||||||
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com>
|
||||||
|
Bartowski <ckealty1182@gmail.com>
|
||||||
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
Behnam M <58621210+ibehnam@users.noreply.github.com>
|
||||||
|
Ben Ashbaugh <ben.ashbaugh@intel.com>
|
||||||
Ben Garney <bengarney@users.noreply.github.com>
|
Ben Garney <bengarney@users.noreply.github.com>
|
||||||
Ben Siraphob <bensiraphob@gmail.com>
|
Ben Siraphob <bensiraphob@gmail.com>
|
||||||
Ben Williams <ben@719ben.com>
|
Ben Williams <ben@719ben.com>
|
||||||
|
Benjamin Findley <39356821+Kartoffelsaft@users.noreply.github.com>
|
||||||
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com>
|
||||||
Bernat Vadell <hounter.caza@gmail.com>
|
Bernat Vadell <hounter.caza@gmail.com>
|
||||||
|
Bingan <70050083+binganao@users.noreply.github.com>
|
||||||
Bodo Graumann <mail@bodograumann.de>
|
Bodo Graumann <mail@bodograumann.de>
|
||||||
Bono Lv <lvscar@users.noreply.github.com>
|
Bono Lv <lvscar@users.noreply.github.com>
|
||||||
Borislav Stanimirov <b.stanimirov@abv.bg>
|
Borislav Stanimirov <b.stanimirov@abv.bg>
|
||||||
Branden Butler <bwtbutler@hotmail.com>
|
Branden Butler <bwtbutler@hotmail.com>
|
||||||
Brian <mofosyne@gmail.com>
|
Brian <mofosyne@gmail.com>
|
||||||
Bruce MacDonald <brucewmacdonald@gmail.com>
|
Bruce MacDonald <brucewmacdonald@gmail.com>
|
||||||
|
Bryan Honof <bryanhonof@gmail.com>
|
||||||
CJ Pais <cj@cjpais.com>
|
CJ Pais <cj@cjpais.com>
|
||||||
CRD716 <crd716@gmail.com>
|
CRD716 <crd716@gmail.com>
|
||||||
|
Calvin Laurenson <calvin@laurenson.dev>
|
||||||
Cameron <csteele@steelecameron.com>
|
Cameron <csteele@steelecameron.com>
|
||||||
Cameron Kaiser <classilla@users.noreply.github.com>
|
Cameron Kaiser <classilla@users.noreply.github.com>
|
||||||
|
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
|
||||||
Casey Primozic <casey@cprimozic.net>
|
Casey Primozic <casey@cprimozic.net>
|
||||||
Casey Primozic <me@ameo.link>
|
Casey Primozic <me@ameo.link>
|
||||||
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
CausalLM <148736309+CausalLM@users.noreply.github.com>
|
||||||
Cebtenzzre <cebtenzzre@gmail.com>
|
Cebtenzzre <cebtenzzre@gmail.com>
|
||||||
Chad Brewbaker <crb002@gmail.com>
|
Chad Brewbaker <crb002@gmail.com>
|
||||||
|
Chao Jiang <jc19chaoj@zoho.com>
|
||||||
Cheng Shao <terrorjack@type.dance>
|
Cheng Shao <terrorjack@type.dance>
|
||||||
|
Chris Elrod <elrodc@gmail.com>
|
||||||
Chris Kuehl <ckuehl@ckuehl.me>
|
Chris Kuehl <ckuehl@ckuehl.me>
|
||||||
Christian Demsar <christian@github.email.demsar.us>
|
Christian Demsar <christian@github.email.demsar.us>
|
||||||
Christian Demsar <crasm@git.vczf.us>
|
Christian Demsar <crasm@git.vczf.us>
|
||||||
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
Christian Falch <875252+chrfalch@users.noreply.github.com>
|
||||||
Christian Kögler <ck3d@gmx.de>
|
Christian Kögler <ck3d@gmx.de>
|
||||||
|
Christian Zhou-Zheng <59622928+christianazinn@users.noreply.github.com>
|
||||||
Clark Saben <76020733+csaben@users.noreply.github.com>
|
Clark Saben <76020733+csaben@users.noreply.github.com>
|
||||||
Clint Herron <hanclinto@gmail.com>
|
Clint Herron <hanclinto@gmail.com>
|
||||||
|
CrispStrobe <154636388+CrispStrobe@users.noreply.github.com>
|
||||||
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
Cuong Trinh Manh <nguoithichkhampha@gmail.com>
|
||||||
DAN™ <dranger003@gmail.com>
|
DAN™ <dranger003@gmail.com>
|
||||||
Damian Stewart <d@damianstewart.com>
|
Damian Stewart <d@damianstewart.com>
|
||||||
|
@ -95,8 +116,12 @@ Daniel Bevenius <daniel.bevenius@gmail.com>
|
||||||
Daniel Drake <drake@endlessos.org>
|
Daniel Drake <drake@endlessos.org>
|
||||||
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
Daniel Hiltgen <dhiltgen@users.noreply.github.com>
|
||||||
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
Daniel Illescas Romero <illescas.daniel@protonmail.com>
|
||||||
|
Daniele <57776841+daniandtheweb@users.noreply.github.com>
|
||||||
DannyDaemonic <DannyDaemonic@gmail.com>
|
DannyDaemonic <DannyDaemonic@gmail.com>
|
||||||
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com>
|
||||||
|
Dave <dave-fl@users.noreply.github.com>
|
||||||
|
Dave Airlie <airlied@gmail.com>
|
||||||
|
Dave Airlie <airlied@redhat.com>
|
||||||
Dave Della Costa <ddellacosta+github@gmail.com>
|
Dave Della Costa <ddellacosta+github@gmail.com>
|
||||||
David Friehs <david@friehs.info>
|
David Friehs <david@friehs.info>
|
||||||
David Kennedy <dakennedyd@gmail.com>
|
David Kennedy <dakennedyd@gmail.com>
|
||||||
|
@ -104,10 +129,13 @@ David Pflug <david@pflug.email>
|
||||||
David Renshaw <dwrenshaw@gmail.com>
|
David Renshaw <dwrenshaw@gmail.com>
|
||||||
David Sommers <12738+databyte@users.noreply.github.com>
|
David Sommers <12738+databyte@users.noreply.github.com>
|
||||||
David Yang <davidyang6us@gmail.com>
|
David Yang <davidyang6us@gmail.com>
|
||||||
|
Dawid Potocki <github@dawidpotocki.com>
|
||||||
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com>
|
||||||
Dean <Dean.Sinaean@gmail.com>
|
Dean <Dean.Sinaean@gmail.com>
|
||||||
Deins <deinsegle@gmail.com>
|
Deins <deinsegle@gmail.com>
|
||||||
|
Deven Mistry <31466137+deven367@users.noreply.github.com>
|
||||||
Didzis Gosko <didzis@users.noreply.github.com>
|
Didzis Gosko <didzis@users.noreply.github.com>
|
||||||
|
Djip007 <djip.perois@free.fr>
|
||||||
Don Mahurin <dmahurin@users.noreply.github.com>
|
Don Mahurin <dmahurin@users.noreply.github.com>
|
||||||
DooWoong Lee (David) <manics99@naver.com>
|
DooWoong Lee (David) <manics99@naver.com>
|
||||||
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com>
|
||||||
|
@ -116,8 +144,11 @@ Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
|
||||||
Ebey Abraham <ebey97@gmail.com>
|
Ebey Abraham <ebey97@gmail.com>
|
||||||
Ed Lee <edilee@mozilla.com>
|
Ed Lee <edilee@mozilla.com>
|
||||||
Ed Lepedus <ed.lepedus@googlemail.com>
|
Ed Lepedus <ed.lepedus@googlemail.com>
|
||||||
|
Eddie-Wang <wangjinheng1120@163.com>
|
||||||
Edward Taylor <edeetee@gmail.com>
|
Edward Taylor <edeetee@gmail.com>
|
||||||
|
Elaine <elaine.zosa@gmail.com>
|
||||||
Elbios <141279586+Elbios@users.noreply.github.com>
|
Elbios <141279586+Elbios@users.noreply.github.com>
|
||||||
|
Elton Kola <eltonkola@gmail.com>
|
||||||
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
Engininja2 <139037756+Engininja2@users.noreply.github.com>
|
||||||
Equim <sayaka@ekyu.moe>
|
Equim <sayaka@ekyu.moe>
|
||||||
Eric Sommerlade <es0m@users.noreply.github.com>
|
Eric Sommerlade <es0m@users.noreply.github.com>
|
||||||
|
@ -143,37 +174,47 @@ Firat <firatkiral@gmail.com>
|
||||||
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
Folko-Ven <71110216+Folko-Ven@users.noreply.github.com>
|
||||||
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com>
|
||||||
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
Francisco Melo <43780565+francis2tm@users.noreply.github.com>
|
||||||
|
Frank Mai <thxcode0824@gmail.com>
|
||||||
FrankHB <frankhb1989@gmail.com>
|
FrankHB <frankhb1989@gmail.com>
|
||||||
|
Fred Douglas <43351173+fredlas@users.noreply.github.com>
|
||||||
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
Frederik Vogel <Schaltfehler@users.noreply.github.com>
|
||||||
Gabe Goodhart <gabe.l.hart@gmail.com>
|
Gabe Goodhart <gabe.l.hart@gmail.com>
|
||||||
GainLee <perfecter.gen@gmail.com>
|
GainLee <perfecter.gen@gmail.com>
|
||||||
Galunid <karolek1231456@gmail.com>
|
Galunid <karolek1231456@gmail.com>
|
||||||
Gary Linscott <glinscott@gmail.com>
|
Gary Linscott <glinscott@gmail.com>
|
||||||
Gary Mulder <gjmulder@gmail.com>
|
Gary Mulder <gjmulder@gmail.com>
|
||||||
|
Gavin Zhao <gavinzhaojw@protonmail.com>
|
||||||
Genkagaku.GPT <hlhr202@163.com>
|
Genkagaku.GPT <hlhr202@163.com>
|
||||||
Georgi Gerganov <ggerganov@gmail.com>
|
Georgi Gerganov <ggerganov@gmail.com>
|
||||||
Gilad S <giladgd@users.noreply.github.com>
|
Gilad S <giladgd@users.noreply.github.com>
|
||||||
|
Giuseppe Scrivano <giuseppe@scrivano.org>
|
||||||
GiviMAD <GiviMAD@users.noreply.github.com>
|
GiviMAD <GiviMAD@users.noreply.github.com>
|
||||||
Govlzkoy <gotope@users.noreply.github.com>
|
Govlzkoy <gotope@users.noreply.github.com>
|
||||||
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
Guillaume "Vermeille" Sanchez <Guillaume.V.Sanchez@gmail.com>
|
||||||
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
Guillaume Wenzek <gwenzek@users.noreply.github.com>
|
||||||
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
Guoteng <32697156+SolenoidWGT@users.noreply.github.com>
|
||||||
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com>
|
||||||
|
Haggai Nuchi <h.nuchi@gmail.com>
|
||||||
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
|
||||||
|
Hamdoud Hakem <90524568+hamdoudhakem@users.noreply.github.com>
|
||||||
|
HanishKVC <hanishkvc@gmail.com>
|
||||||
Haohui Mai <ricetons@gmail.com>
|
Haohui Mai <ricetons@gmail.com>
|
||||||
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
Haoxiang Fei <tonyfettes@tonyfettes.com>
|
||||||
Harald Fernengel <harald.fernengel@here.com>
|
Harald Fernengel <harald.fernengel@here.com>
|
||||||
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
Hatsune Miku <129688334+at8u@users.noreply.github.com>
|
||||||
|
HatsuneMikuUwU33 <173229399+HatsuneMikuUwU33@users.noreply.github.com>
|
||||||
Henk Poley <HenkPoley@gmail.com>
|
Henk Poley <HenkPoley@gmail.com>
|
||||||
Henri Vasserman <henv@hot.ee>
|
Henri Vasserman <henv@hot.ee>
|
||||||
Henrik Forstén <henrik.forsten@gmail.com>
|
Henrik Forstén <henrik.forsten@gmail.com>
|
||||||
Herman Semenov <GermanAizek@yandex.ru>
|
Herman Semenov <GermanAizek@yandex.ru>
|
||||||
Hesen Peng <hesen.peng@gmail.com>
|
Hesen Peng <hesen.peng@gmail.com>
|
||||||
Hoang Nguyen <hugo53@users.noreply.github.com>
|
Hoang Nguyen <hugo53@users.noreply.github.com>
|
||||||
|
Hong Bo PENG <penghb@cn.ibm.com>
|
||||||
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
Hongyu Ouyang <96765450+casavaca@users.noreply.github.com>
|
||||||
Howard Su <howard0su@gmail.com>
|
Howard Su <howard0su@gmail.com>
|
||||||
Hua Jiang <allenhjiang@outlook.com>
|
Hua Jiang <allenhjiang@outlook.com>
|
||||||
Huawei Lin <huaweilin.cs@gmail.com>
|
Huawei Lin <huaweilin.cs@gmail.com>
|
||||||
|
Hugo Roussel <hugo.rous@gmail.com>
|
||||||
Ian Bull <irbull@eclipsesource.com>
|
Ian Bull <irbull@eclipsesource.com>
|
||||||
Ian Bull <irbull@gmail.com>
|
Ian Bull <irbull@gmail.com>
|
||||||
Ian Scrivener <github@zilogy.asia>
|
Ian Scrivener <github@zilogy.asia>
|
||||||
|
@ -190,8 +231,10 @@ Ivan Stepanov <ivanstepanovftw@gmail.com>
|
||||||
JH23X <165871467+JH23X@users.noreply.github.com>
|
JH23X <165871467+JH23X@users.noreply.github.com>
|
||||||
Jack Mousseau <jmousseau@users.noreply.github.com>
|
Jack Mousseau <jmousseau@users.noreply.github.com>
|
||||||
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
JackJollimore <130917767+JackJollimore@users.noreply.github.com>
|
||||||
|
Jaemin Son <woalsdnd@gmail.com>
|
||||||
Jag Chadha <jagtesh@gmail.com>
|
Jag Chadha <jagtesh@gmail.com>
|
||||||
Jakub N <jakubniemczyk97@gmail.com>
|
Jakub N <jakubniemczyk97@gmail.com>
|
||||||
|
James A Capozzoli <157492257+jac-jim@users.noreply.github.com>
|
||||||
James Reynolds <magnusviri@users.noreply.github.com>
|
James Reynolds <magnusviri@users.noreply.github.com>
|
||||||
Jan Boon <jan.boon@kaetemi.be>
|
Jan Boon <jan.boon@kaetemi.be>
|
||||||
Jan Boon <kaetemi@gmail.com>
|
Jan Boon <kaetemi@gmail.com>
|
||||||
|
@ -205,12 +248,17 @@ Jean-Michaël Celerier <jeanmichael.celerier+github@gmail.com>
|
||||||
Jed Fox <git@jedfox.com>
|
Jed Fox <git@jedfox.com>
|
||||||
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
Jeffrey Quesnelle <emozilla@nousresearch.com>
|
||||||
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
Jesse Jojo Johnson <williamsaintgeorge@gmail.com>
|
||||||
|
Jeximo <jeximo@gmail.com>
|
||||||
Jhen-Jie Hong <iainst0409@gmail.com>
|
Jhen-Jie Hong <iainst0409@gmail.com>
|
||||||
Jiahao Li <liplus17@163.com>
|
Jiahao Li <liplus17@163.com>
|
||||||
Jian Liao <jianliao@users.noreply.github.com>
|
Jian Liao <jianliao@users.noreply.github.com>
|
||||||
JidongZhang-THU <1119708529@qq.com>
|
JidongZhang-THU <1119708529@qq.com>
|
||||||
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com>
|
||||||
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
|
||||||
|
Jiří Sejkora <Sejseloid@gmail.com>
|
||||||
|
Joan Fontanals <jfontanalsmartinez@gmail.com>
|
||||||
|
Joan Fontanals <joan.fontanals.martinez@jina.ai>
|
||||||
|
Johan <JohanAR@users.noreply.github.com>
|
||||||
Johannes Gäßler <johannesg@5d6.de>
|
Johannes Gäßler <johannesg@5d6.de>
|
||||||
Johannes Rudolph <johannes.rudolph@gmail.com>
|
Johannes Rudolph <johannes.rudolph@gmail.com>
|
||||||
John <78893154+cmp-nct@users.noreply.github.com>
|
John <78893154+cmp-nct@users.noreply.github.com>
|
||||||
|
@ -221,15 +269,19 @@ Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com>
|
||||||
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
Jorge A <161275481+jorgealias@users.noreply.github.com>
|
||||||
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com>
|
||||||
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
Joseph Stahl <1269177+josephst@users.noreply.github.com>
|
||||||
|
Josh Ramer <josh.ramer@icloud.com>
|
||||||
Joyce <joycebrum@google.com>
|
Joyce <joycebrum@google.com>
|
||||||
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
Juan Calderon-Perez <835733+gaby@users.noreply.github.com>
|
||||||
Judd <foldl@users.noreply.github.com>
|
Judd <foldl@users.noreply.github.com>
|
||||||
Julius Arkenberg <arki05@users.noreply.github.com>
|
Julius Arkenberg <arki05@users.noreply.github.com>
|
||||||
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
Jun Jie <71215065+junnjiee16@users.noreply.github.com>
|
||||||
|
Junyang Lin <justinlin930319@hotmail.com>
|
||||||
Juraj Bednar <juraj@bednar.io>
|
Juraj Bednar <juraj@bednar.io>
|
||||||
Justin Parker <jparkerweb@gmail.com>
|
Justin Parker <jparkerweb@gmail.com>
|
||||||
Justin Suess <justin.suess@westpoint.edu>
|
Justin Suess <justin.suess@westpoint.edu>
|
||||||
|
Justina Cho <justcho5@gmail.com>
|
||||||
Justine Tunney <jtunney@gmail.com>
|
Justine Tunney <jtunney@gmail.com>
|
||||||
|
Justine Tunney <jtunney@mozilla.com>
|
||||||
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
Juuso Alasuutari <juuso.alasuutari@gmail.com>
|
||||||
KASR <karim.asrih@gmail.com>
|
KASR <karim.asrih@gmail.com>
|
||||||
Kamil Tomšík <info@tomsik.cz>
|
Kamil Tomšík <info@tomsik.cz>
|
||||||
|
@ -242,6 +294,7 @@ Kawrakow <48489457+ikawrakow@users.noreply.github.com>
|
||||||
Keiichi Tabata <keiichi.tabata@outlook.com>
|
Keiichi Tabata <keiichi.tabata@outlook.com>
|
||||||
Kenvix ⭐ <kenvixzure@live.com>
|
Kenvix ⭐ <kenvixzure@live.com>
|
||||||
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com>
|
||||||
|
Kevin Gibbons <bakkot@gmail.com>
|
||||||
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
Kevin Ji <1146876+kevinji@users.noreply.github.com>
|
||||||
Kevin Kwok <antimatter15@gmail.com>
|
Kevin Kwok <antimatter15@gmail.com>
|
||||||
Kevin Lo <kevlo@kevlo.org>
|
Kevin Lo <kevlo@kevlo.org>
|
||||||
|
@ -257,6 +310,7 @@ Laura <Tijntje_7@msn.com>
|
||||||
Lee <44310445+lx200916@users.noreply.github.com>
|
Lee <44310445+lx200916@users.noreply.github.com>
|
||||||
Lee Drake <b.lee.drake@gmail.com>
|
Lee Drake <b.lee.drake@gmail.com>
|
||||||
Leng Yue <lengyue@lengyue.me>
|
Leng Yue <lengyue@lengyue.me>
|
||||||
|
Leon Knauer <git@leonknauer.com>
|
||||||
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
|
||||||
Leonardo Neumann <leonardo@neumann.dev.br>
|
Leonardo Neumann <leonardo@neumann.dev.br>
|
||||||
Li Tan <tanliboy@gmail.com>
|
Li Tan <tanliboy@gmail.com>
|
||||||
|
@ -265,20 +319,26 @@ LoganDark <github@logandark.mozmail.com>
|
||||||
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
LostRuins <39025047+LostRuins@users.noreply.github.com>
|
||||||
Luciano <lucianostrika44@gmail.com>
|
Luciano <lucianostrika44@gmail.com>
|
||||||
Luo Tian <lt@basecity.com>
|
Luo Tian <lt@basecity.com>
|
||||||
|
Lyle Dean <dean@lyle.dev>
|
||||||
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
|
||||||
Maarten ter Huurne <maarten@treewalker.org>
|
Maarten ter Huurne <maarten@treewalker.org>
|
||||||
Mack Straight <eiz@users.noreply.github.com>
|
Mack Straight <eiz@users.noreply.github.com>
|
||||||
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
Maël Kerbiriou <m431.kerbiriou@gmail.com>
|
||||||
MaggotHATE <clay1326@gmail.com>
|
MaggotHATE <clay1326@gmail.com>
|
||||||
|
Manuel <44313466+makuche@users.noreply.github.com>
|
||||||
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
Marc Köhlbrugge <subscriptions@marckohlbrugge.com>
|
||||||
Marco Matthies <71844+marcom@users.noreply.github.com>
|
Marco Matthies <71844+marcom@users.noreply.github.com>
|
||||||
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com>
|
||||||
Marian Cepok <marian.cepok@gmail.com>
|
Marian Cepok <marian.cepok@gmail.com>
|
||||||
Mark Fairbairn <thebaron88@gmail.com>
|
Mark Fairbairn <thebaron88@gmail.com>
|
||||||
Marko Tasic <mtasic85@gmail.com>
|
Marko Tasic <mtasic85@gmail.com>
|
||||||
|
Markus Tavenrath <mtavenrath@users.noreply.github.com>
|
||||||
|
Martin Delille <martin@delille.org>
|
||||||
Martin Krasser <krasserm@googlemail.com>
|
Martin Krasser <krasserm@googlemail.com>
|
||||||
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
Martin Schwaighofer <mschwaig@users.noreply.github.com>
|
||||||
Marvin Gießing <marvin.giessing@gmail.com>
|
Marvin Gießing <marvin.giessing@gmail.com>
|
||||||
|
Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
|
||||||
|
MasterYi1024 <39848311+MasterYi1024@users.noreply.github.com>
|
||||||
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
Mateusz Charytoniuk <mateusz.charytoniuk@protonmail.com>
|
||||||
Matheus C. França <matheus-catarino@hotmail.com>
|
Matheus C. França <matheus-catarino@hotmail.com>
|
||||||
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
Matheus Gabriel Alves Silva <matheusgasource@gmail.com>
|
||||||
|
@ -287,8 +347,11 @@ Mathijs de Bruin <mathijs@mathijsfietst.nl>
|
||||||
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
Matt Clayton <156335168+mattjcly@users.noreply.github.com>
|
||||||
Matt Pulver <matt.pulver@heavy.ai>
|
Matt Pulver <matt.pulver@heavy.ai>
|
||||||
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
Matteo Boschini <12133566+mbosc@users.noreply.github.com>
|
||||||
|
Mattheus Chediak <shammcity00@gmail.com>
|
||||||
Matthew Tejo <matthew.tejo@gmail.com>
|
Matthew Tejo <matthew.tejo@gmail.com>
|
||||||
Matvey Soloviev <blackhole89@gmail.com>
|
Matvey Soloviev <blackhole89@gmail.com>
|
||||||
|
Max Krasnyansky <max.krasnyansky@gmail.com>
|
||||||
|
Max Krasnyansky <quic_maxk@quicinc.com>
|
||||||
Maxime <672982+maximegmd@users.noreply.github.com>
|
Maxime <672982+maximegmd@users.noreply.github.com>
|
||||||
Maximilian Winter <maximilian.winter.91@gmail.com>
|
Maximilian Winter <maximilian.winter.91@gmail.com>
|
||||||
Meng Zhang <meng@tabbyml.com>
|
Meng Zhang <meng@tabbyml.com>
|
||||||
|
@ -300,32 +363,41 @@ Michael Kesper <mkesper@schokokeks.org>
|
||||||
Michael Klimenko <mklimenko29@gmail.com>
|
Michael Klimenko <mklimenko29@gmail.com>
|
||||||
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
Michael Podvitskiy <podvitskiymichael@gmail.com>
|
||||||
Michael Potter <NanoTekGuy@Gmail.com>
|
Michael Potter <NanoTekGuy@Gmail.com>
|
||||||
|
Michael de Gans <michael.john.degans@gmail.com>
|
||||||
Michaël de Vries <vriesdemichael@gmail.com>
|
Michaël de Vries <vriesdemichael@gmail.com>
|
||||||
Mihai <mihai.chirculescu@yahoo.com>
|
Mihai <mihai.chirculescu@yahoo.com>
|
||||||
Mike <ytianhui2004@gmail.com>
|
Mike <ytianhui2004@gmail.com>
|
||||||
|
Mikko Juola <mikjuo@gmail.com>
|
||||||
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
Minsoo Cheong <54794500+mscheong01@users.noreply.github.com>
|
||||||
Mirko185 <mirkosig@gmail.com>
|
Mirko185 <mirkosig@gmail.com>
|
||||||
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
Mirror Azure <54669636+MirrorAzure@users.noreply.github.com>
|
||||||
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com>
|
||||||
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
Mohammadreza Hendiani <hendiani.mohammadreza@gmail.com>
|
||||||
|
Mohammadreza Hendiani <mohammad.r.hendiani@gmail.com>
|
||||||
Murilo Santana <mvrilo@gmail.com>
|
Murilo Santana <mvrilo@gmail.com>
|
||||||
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
Musab Gultekin <musabgultekin@users.noreply.github.com>
|
||||||
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
Nam D. Tran <42194884+namtranase@users.noreply.github.com>
|
||||||
|
Nathan Epstein <nate2@umbc.edu>
|
||||||
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
NawafAlansari <72708095+NawafAlansari@users.noreply.github.com>
|
||||||
Nebula <infinitewormhole@gmail.com>
|
Nebula <infinitewormhole@gmail.com>
|
||||||
|
Neo Zhang <14088817+arthw@users.noreply.github.com>
|
||||||
|
Neo Zhang <zhang.jianyu@outlook.com>
|
||||||
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
Neo Zhang Jianyu <jianyu.zhang@intel.com>
|
||||||
Neuman Vong <neuman.vong@gmail.com>
|
Neuman Vong <neuman.vong@gmail.com>
|
||||||
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
Nexesenex <124105151+Nexesenex@users.noreply.github.com>
|
||||||
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
Niall Coates <1349685+Niall-@users.noreply.github.com>
|
||||||
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
Nicolai Weitkemper <kontakt@nicolaiweitkemper.de>
|
||||||
|
Nicolás Pérez <nicolas_perez@brown.edu>
|
||||||
Nigel Bosch <pnigelb@gmail.com>
|
Nigel Bosch <pnigelb@gmail.com>
|
||||||
Niklas Korz <niklas@niklaskorz.de>
|
Niklas Korz <niklas@niklaskorz.de>
|
||||||
|
Nikolas <127742645+nneubacher@users.noreply.github.com>
|
||||||
Nindaleth <Nindaleth@users.noreply.github.com>
|
Nindaleth <Nindaleth@users.noreply.github.com>
|
||||||
Oleksandr Nikitin <oleksandr@tvori.info>
|
Oleksandr Nikitin <oleksandr@tvori.info>
|
||||||
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
Oleksii Maryshchenko <oleksii.maryshchenko@gmail.com>
|
||||||
Olivier Chafik <ochafik@users.noreply.github.com>
|
Olivier Chafik <ochafik@users.noreply.github.com>
|
||||||
Ondřej Čertík <ondrej@certik.us>
|
Ondřej Čertík <ondrej@certik.us>
|
||||||
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
|
||||||
|
Patrice Ferlet <metal3d@gmail.com>
|
||||||
Paul Tsochantaris <ptsochantaris@icloud.com>
|
Paul Tsochantaris <ptsochantaris@icloud.com>
|
||||||
Pavol Rusnak <pavol@rusnak.io>
|
Pavol Rusnak <pavol@rusnak.io>
|
||||||
Pedro Cuenca <pedro@huggingface.co>
|
Pedro Cuenca <pedro@huggingface.co>
|
||||||
|
@ -343,9 +415,14 @@ RJ Adriaansen <adriaansen@eshcc.eur.nl>
|
||||||
Radoslav Gerganov <rgerganov@gmail.com>
|
Radoslav Gerganov <rgerganov@gmail.com>
|
||||||
Radosław Gryta <radek.gryta@gmail.com>
|
Radosław Gryta <radek.gryta@gmail.com>
|
||||||
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com>
|
||||||
|
Raj Hammeer Singh Hada <hammeerraj@gmail.com>
|
||||||
|
Ralph Soika <ralph.soika@imixs.com>
|
||||||
Rand Xie <randxiexyy29@gmail.com>
|
Rand Xie <randxiexyy29@gmail.com>
|
||||||
Randall Fitzgerald <randall@dasaku.net>
|
Randall Fitzgerald <randall@dasaku.net>
|
||||||
Reinforce-II <fate@eastal.com>
|
Reinforce-II <fate@eastal.com>
|
||||||
|
Ren Xuancheng <jklj077@users.noreply.github.com>
|
||||||
|
Rene Leonhardt <65483435+reneleonhardt@users.noreply.github.com>
|
||||||
|
RhinoDevel <RhinoDevel@users.noreply.github.com>
|
||||||
Riceball LEE <snowyu.lee@gmail.com>
|
Riceball LEE <snowyu.lee@gmail.com>
|
||||||
Richard Kiss <him@richardkiss.com>
|
Richard Kiss <him@richardkiss.com>
|
||||||
Richard Roberson <richardr1126@gmail.com>
|
Richard Roberson <richardr1126@gmail.com>
|
||||||
|
@ -373,6 +450,7 @@ Rowan Hart <rowanbhart@gmail.com>
|
||||||
Rune <43761327+Rune-AI@users.noreply.github.com>
|
Rune <43761327+Rune-AI@users.noreply.github.com>
|
||||||
Ryan Landay <rlanday@gmail.com>
|
Ryan Landay <rlanday@gmail.com>
|
||||||
Ryder Wishart <ryderwishart@gmail.com>
|
Ryder Wishart <ryderwishart@gmail.com>
|
||||||
|
Ryuei <louixs@users.noreply.github.com>
|
||||||
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
Rőczey Barnabás <31726601+An0nie@users.noreply.github.com>
|
||||||
SakuraUmi <yukinon244@gmail.com>
|
SakuraUmi <yukinon244@gmail.com>
|
||||||
Salvador E. Tropea <stropea@inti.gob.ar>
|
Salvador E. Tropea <stropea@inti.gob.ar>
|
||||||
|
@ -386,6 +464,7 @@ SebastianApel <13675545+SebastianApel@users.noreply.github.com>
|
||||||
Senemu <10880819+Senemu@users.noreply.github.com>
|
Senemu <10880819+Senemu@users.noreply.github.com>
|
||||||
Sergey Alirzaev <zl29ah@gmail.com>
|
Sergey Alirzaev <zl29ah@gmail.com>
|
||||||
Sergio López <slp@sinrega.org>
|
Sergio López <slp@sinrega.org>
|
||||||
|
Sertaç Özercan <852750+sozercan@users.noreply.github.com>
|
||||||
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
SeungWon Jeong <65549245+redlion0929@users.noreply.github.com>
|
||||||
ShadovvBeast <ShadovvBeast@gmail.com>
|
ShadovvBeast <ShadovvBeast@gmail.com>
|
||||||
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
Shakhar Dasgupta <shakhardasgupta@gmail.com>
|
||||||
|
@ -394,6 +473,7 @@ Shijie <821898965@qq.com>
|
||||||
Shintarou Okada <kokuzen@gmail.com>
|
Shintarou Okada <kokuzen@gmail.com>
|
||||||
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com>
|
||||||
Shouzheng Liu <lshzh.hi@gmail.com>
|
Shouzheng Liu <lshzh.hi@gmail.com>
|
||||||
|
Shuichi Tsutsumi <shuichi0526@gmail.com>
|
||||||
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
|
||||||
Simon Willison <swillison@gmail.com>
|
Simon Willison <swillison@gmail.com>
|
||||||
Siwen Yu <yusiwen@gmail.com>
|
Siwen Yu <yusiwen@gmail.com>
|
||||||
|
@ -405,11 +485,14 @@ Someone <sergei.kozlukov@aalto.fi>
|
||||||
Someone Serge <sergei.kozlukov@aalto.fi>
|
Someone Serge <sergei.kozlukov@aalto.fi>
|
||||||
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com>
|
||||||
Spencer Sutton <spencersutton@users.noreply.github.com>
|
Spencer Sutton <spencersutton@users.noreply.github.com>
|
||||||
|
Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
|
||||||
Srinivas Billa <nivibilla@gmail.com>
|
Srinivas Billa <nivibilla@gmail.com>
|
||||||
Stefan Sydow <stefan@sydow.email>
|
Stefan Sydow <stefan@sydow.email>
|
||||||
|
Steffen Röcker <sroecker@gmail.com>
|
||||||
Stephan Walter <stephan@walter.name>
|
Stephan Walter <stephan@walter.name>
|
||||||
Stephen Nichols <snichols@users.noreply.github.com>
|
Stephen Nichols <snichols@users.noreply.github.com>
|
||||||
Steve Grubb <ausearch.1@gmail.com>
|
Steve Grubb <ausearch.1@gmail.com>
|
||||||
|
Steven Prichard <spprichard20@gmail.com>
|
||||||
Steven Roussey <sroussey@gmail.com>
|
Steven Roussey <sroussey@gmail.com>
|
||||||
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
|
||||||
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com>
|
||||||
|
@ -434,16 +517,19 @@ Tom C <tom.corelis@gmail.com>
|
||||||
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
|
||||||
Tomas <tom.tomas.36478119@gmail.com>
|
Tomas <tom.tomas.36478119@gmail.com>
|
||||||
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
Tomáš Pazdiora <tomas.pazdiora@gmail.com>
|
||||||
|
Tristan Druyen <tristan@vault81.mozmail.com>
|
||||||
Tristan Ross <rosscomputerguy@protonmail.com>
|
Tristan Ross <rosscomputerguy@protonmail.com>
|
||||||
Tungsten842 <886724vf@anonaddy.me>
|
Tungsten842 <886724vf@anonaddy.me>
|
||||||
Tungsten842 <quantmint@protonmail.com>
|
Tungsten842 <quantmint@protonmail.com>
|
||||||
Tushar <ditsuke@protonmail.com>
|
Tushar <ditsuke@protonmail.com>
|
||||||
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
UEXTM.com <84163508+uextm@users.noreply.github.com>
|
||||||
|
Ulrich Drepper <drepper@gmail.com>
|
||||||
Uzo Nweke <uzoechi@gmail.com>
|
Uzo Nweke <uzoechi@gmail.com>
|
||||||
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
Vaibhav Srivastav <vaibhavs10@gmail.com>
|
||||||
Val Kharitonov <mail@kharvd.com>
|
Val Kharitonov <mail@kharvd.com>
|
||||||
Valentin Konovalov <valle.ketsujin@gmail.com>
|
Valentin Konovalov <valle.ketsujin@gmail.com>
|
||||||
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com>
|
||||||
|
Victor Nogueira <felladrin@gmail.com>
|
||||||
Victor Z. Peng <ziliangdotme@gmail.com>
|
Victor Z. Peng <ziliangdotme@gmail.com>
|
||||||
Vlad <spitfireage@gmail.com>
|
Vlad <spitfireage@gmail.com>
|
||||||
Vladimir <bogdad@gmail.com>
|
Vladimir <bogdad@gmail.com>
|
||||||
|
@ -455,7 +541,9 @@ Weird Constructor <weirdconstructor@gmail.com>
|
||||||
Welby Seely <welbyseely@gmail.com>
|
Welby Seely <welbyseely@gmail.com>
|
||||||
Wentai Zhang <rchardx@gmail.com>
|
Wentai Zhang <rchardx@gmail.com>
|
||||||
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
|
||||||
|
William Tambellini <william.tambellini@gmail.com>
|
||||||
Willy Tarreau <w@1wt.eu>
|
Willy Tarreau <w@1wt.eu>
|
||||||
|
Wouter <9594229+DifferentialityDevelopment@users.noreply.github.com>
|
||||||
Wu Jian Ping <wujjpp@hotmail.com>
|
Wu Jian Ping <wujjpp@hotmail.com>
|
||||||
Wu Jian Ping <wujp@greatld.com>
|
Wu Jian Ping <wujp@greatld.com>
|
||||||
Xiake Sun <xiake.sun@intel.com>
|
Xiake Sun <xiake.sun@intel.com>
|
||||||
|
@ -466,6 +554,8 @@ Xiaoyi Chen <cxychina@gmail.com>
|
||||||
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
Xingchen Song(宋星辰) <xingchensong1996@163.com>
|
||||||
Xuan Son Nguyen <thichthat@gmail.com>
|
Xuan Son Nguyen <thichthat@gmail.com>
|
||||||
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
Yann Follet <131855179+YannFollet@users.noreply.github.com>
|
||||||
|
Yaroslav <yaroslav.yashin@me.com>
|
||||||
|
Yazan Agha-Schrader <mountaiin@icloud.com>
|
||||||
Yiming Cui <conandiy@vip.qq.com>
|
Yiming Cui <conandiy@vip.qq.com>
|
||||||
Yishuo Wang <MeouSker77@outlook.com>
|
Yishuo Wang <MeouSker77@outlook.com>
|
||||||
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com>
|
||||||
|
@ -477,6 +567,7 @@ Zane Shannon <z@zcs.me>
|
||||||
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
Zay <95888118+isaiahbjork@users.noreply.github.com>
|
||||||
Zenix <zenixls2@gmail.com>
|
Zenix <zenixls2@gmail.com>
|
||||||
Zhang Peiyuan <a1286225768@gmail.com>
|
Zhang Peiyuan <a1286225768@gmail.com>
|
||||||
|
Zheng.Deng <32841220+dengzheng-cloud@users.noreply.github.com>
|
||||||
ZhouYuChen <zhouyuchen@naver.com>
|
ZhouYuChen <zhouyuchen@naver.com>
|
||||||
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
Ziad Ben Hadj-Alouane <zied.benhadjalouane@gmail.com>
|
||||||
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
|
||||||
|
@ -484,14 +575,18 @@ Zsapi <martin1.zsapka@gmail.com>
|
||||||
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com>
|
||||||
adel boussaken <netdur@gmail.com>
|
adel boussaken <netdur@gmail.com>
|
||||||
afrideva <95653597+afrideva@users.noreply.github.com>
|
afrideva <95653597+afrideva@users.noreply.github.com>
|
||||||
|
agray3 <agray3@users.noreply.github.com>
|
||||||
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
akawrykow <142945436+akawrykow@users.noreply.github.com>
|
||||||
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
alexpinel <93524949+alexpinel@users.noreply.github.com>
|
||||||
alonfaraj <alonfaraj@gmail.com>
|
alonfaraj <alonfaraj@gmail.com>
|
||||||
|
alwqx <kenan3015@gmail.com>
|
||||||
|
amd-lalithnc <lalithnc@amd.com>
|
||||||
andrijdavid <david@geek.mg>
|
andrijdavid <david@geek.mg>
|
||||||
anon998 <131767832+anon998@users.noreply.github.com>
|
anon998 <131767832+anon998@users.noreply.github.com>
|
||||||
anzz1 <anzz1@live.com>
|
anzz1 <anzz1@live.com>
|
||||||
apaz <aarpazdera@gmail.com>
|
apaz <aarpazdera@gmail.com>
|
||||||
apcameron <37645737+apcameron@users.noreply.github.com>
|
apcameron <37645737+apcameron@users.noreply.github.com>
|
||||||
|
arch-btw <57669023+arch-btw@users.noreply.github.com>
|
||||||
arcrank <arcrank@gmail.com>
|
arcrank <arcrank@gmail.com>
|
||||||
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com>
|
||||||
at8u <129688334+at8u@users.noreply.github.com>
|
at8u <129688334+at8u@users.noreply.github.com>
|
||||||
|
@ -514,13 +609,17 @@ cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com>
|
||||||
coezbek <c.oezbek@gmail.com>
|
coezbek <c.oezbek@gmail.com>
|
||||||
comex <comexk@gmail.com>
|
comex <comexk@gmail.com>
|
||||||
compilade <113953597+compilade@users.noreply.github.com>
|
compilade <113953597+compilade@users.noreply.github.com>
|
||||||
|
compilade <git@compilade.net>
|
||||||
|
cpumaxx <163466046+cpumaxx@users.noreply.github.com>
|
||||||
crasm <crasm@git.vczf.net>
|
crasm <crasm@git.vczf.net>
|
||||||
crasm <crasm@git.vczf.us>
|
crasm <crasm@git.vczf.us>
|
||||||
daboe01 <daboe01@googlemail.com>
|
daboe01 <daboe01@googlemail.com>
|
||||||
david raistrick <keen99@users.noreply.github.com>
|
david raistrick <keen99@users.noreply.github.com>
|
||||||
|
ddh0 <dylanhalladay02@icloud.com>
|
||||||
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
ddpasa <112642920+ddpasa@users.noreply.github.com>
|
||||||
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
deepdiffuser <112834445+deepdiffuser@users.noreply.github.com>
|
||||||
divinity76 <divinity76@gmail.com>
|
divinity76 <divinity76@gmail.com>
|
||||||
|
dm4 <sunrisedm4@gmail.com>
|
||||||
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
dotpy314 <33351922+dotpy314@users.noreply.github.com>
|
||||||
drbh <david.richard.holtz@gmail.com>
|
drbh <david.richard.holtz@gmail.com>
|
||||||
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
ds5t5 <145942675+ds5t5@users.noreply.github.com>
|
||||||
|
@ -529,6 +628,7 @@ eastriver <lee@eastriver.dev>
|
||||||
ebraminio <ebraminio@gmail.com>
|
ebraminio <ebraminio@gmail.com>
|
||||||
eiery <19350831+eiery@users.noreply.github.com>
|
eiery <19350831+eiery@users.noreply.github.com>
|
||||||
eric8607242 <e0928021388@gmail.com>
|
eric8607242 <e0928021388@gmail.com>
|
||||||
|
fairydreaming <166155368+fairydreaming@users.noreply.github.com>
|
||||||
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
|
||||||
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
github-actions[bot] <github-actions[bot]@users.noreply.github.com>
|
||||||
gliptic <gliptic@users.noreply.github.com>
|
gliptic <gliptic@users.noreply.github.com>
|
||||||
|
@ -539,6 +639,7 @@ h-h-h-h <13482553+h-h-h-h@users.noreply.github.com>
|
||||||
hankcs <cnhankmc@gmail.com>
|
hankcs <cnhankmc@gmail.com>
|
||||||
hoangmit <hoangmit@users.noreply.github.com>
|
hoangmit <hoangmit@users.noreply.github.com>
|
||||||
hongbo.mo <352280764@qq.com>
|
hongbo.mo <352280764@qq.com>
|
||||||
|
hopkins385 <98618192+hopkins385@users.noreply.github.com>
|
||||||
howlger <eclipse@voormann.de>
|
howlger <eclipse@voormann.de>
|
||||||
howlger <github@voormann.de>
|
howlger <github@voormann.de>
|
||||||
hutli <6594598+hutli@users.noreply.github.com>
|
hutli <6594598+hutli@users.noreply.github.com>
|
||||||
|
@ -549,14 +650,22 @@ hydai <z54981220@gmail.com>
|
||||||
iSma <ismail.senhaji@gmail.com>
|
iSma <ismail.senhaji@gmail.com>
|
||||||
iacore <74560659+iacore@users.noreply.github.com>
|
iacore <74560659+iacore@users.noreply.github.com>
|
||||||
igarnier <igarnier@protonmail.com>
|
igarnier <igarnier@protonmail.com>
|
||||||
|
intelmatt <61025942+intelmatt@users.noreply.github.com>
|
||||||
iohub <rickyang.pro@gmail.com>
|
iohub <rickyang.pro@gmail.com>
|
||||||
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com>
|
||||||
|
jaime-m-p <167997752+jaime-m-p@users.noreply.github.com>
|
||||||
jameswu2014 <545426914@qq.com>
|
jameswu2014 <545426914@qq.com>
|
||||||
|
jiez <373447296@qq.com>
|
||||||
jneem <joeneeman@gmail.com>
|
jneem <joeneeman@gmail.com>
|
||||||
|
joecryptotoo <80373433+joecryptotoo@users.noreply.github.com>
|
||||||
johnson442 <56517414+johnson442@users.noreply.github.com>
|
johnson442 <56517414+johnson442@users.noreply.github.com>
|
||||||
|
jojorne <jojorne@users.noreply.github.com>
|
||||||
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
jon-chuang <9093549+jon-chuang@users.noreply.github.com>
|
||||||
jp-x-g <jpxg-dev@protonmail.com>
|
jp-x-g <jpxg-dev@protonmail.com>
|
||||||
|
jukofyork <69222624+jukofyork@users.noreply.github.com>
|
||||||
|
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
|
||||||
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
jwj7140 <32943891+jwj7140@users.noreply.github.com>
|
||||||
|
k.h.lai <adrian.k.h.lai@outlook.com>
|
||||||
kaizau <kaizau@users.noreply.github.com>
|
kaizau <kaizau@users.noreply.github.com>
|
||||||
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
kalomaze <66376113+kalomaze@users.noreply.github.com>
|
||||||
kang <tpdns9032100@gmail.com>
|
kang <tpdns9032100@gmail.com>
|
||||||
|
@ -575,11 +684,15 @@ ldwang <ftgreat@163.com>
|
||||||
le.chang <cljs118@126.com>
|
le.chang <cljs118@126.com>
|
||||||
leejet <leejet714@gmail.com>
|
leejet <leejet714@gmail.com>
|
||||||
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
limitedAtonement <limitedAtonement@users.noreply.github.com>
|
||||||
|
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
|
||||||
lon <114724657+longregen@users.noreply.github.com>
|
lon <114724657+longregen@users.noreply.github.com>
|
||||||
|
loonerin <132926317+loonerin@users.noreply.github.com>
|
||||||
|
luoyu-intel <yu.luo@intel.com>
|
||||||
m3ndax <adrian.goessl@outlook.com>
|
m3ndax <adrian.goessl@outlook.com>
|
||||||
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
maddes8cht <55592906+maddes8cht@users.noreply.github.com>
|
||||||
makomk <makosoft@googlemail.com>
|
makomk <makosoft@googlemail.com>
|
||||||
manikbhandari <mbbhandarimanik2@gmail.com>
|
manikbhandari <mbbhandarimanik2@gmail.com>
|
||||||
|
maor-ps <154728172+maor-ps@users.noreply.github.com>
|
||||||
mdrokz <mohammadmunshi@gmail.com>
|
mdrokz <mohammadmunshi@gmail.com>
|
||||||
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com>
|
||||||
minarchist <minarchist@users.noreply.github.com>
|
minarchist <minarchist@users.noreply.github.com>
|
||||||
|
@ -593,15 +706,19 @@ ngc92 <7938269+ngc92@users.noreply.github.com>
|
||||||
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
nhamanasu <45545786+nhamanasu@users.noreply.github.com>
|
||||||
niansa/tuxifan <anton-sa@web.de>
|
niansa/tuxifan <anton-sa@web.de>
|
||||||
niansa/tuxifan <tuxifan@posteo.de>
|
niansa/tuxifan <tuxifan@posteo.de>
|
||||||
|
nickp27 <nb.porter@gmail.com>
|
||||||
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
ningshanwutuobang <ningshanwutuobang@gmail.com>
|
||||||
nold <Nold360@users.noreply.github.com>
|
nold <Nold360@users.noreply.github.com>
|
||||||
nopperl <54780682+nopperl@users.noreply.github.com>
|
nopperl <54780682+nopperl@users.noreply.github.com>
|
||||||
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
nusu-github <29514220+nusu-github@users.noreply.github.com>
|
||||||
olexiyb <olexiyb@gmail.com>
|
olexiyb <olexiyb@gmail.com>
|
||||||
|
omahs <73983677+omahs@users.noreply.github.com>
|
||||||
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
oobabooga <112222186+oobabooga@users.noreply.github.com>
|
||||||
opparco <parco.opaai@gmail.com>
|
opparco <parco.opaai@gmail.com>
|
||||||
ostix360 <55257054+ostix360@users.noreply.github.com>
|
ostix360 <55257054+ostix360@users.noreply.github.com>
|
||||||
|
pengxin99 <pengxin.yuan@intel.com>
|
||||||
perserk <perserk@gmail.com>
|
perserk <perserk@gmail.com>
|
||||||
|
pmysl <piotr.myslinski@outlook.com>
|
||||||
postmasters <namnguyen@google.com>
|
postmasters <namnguyen@google.com>
|
||||||
pudepiedj <pudepiedj@gmail.com>
|
pudepiedj <pudepiedj@gmail.com>
|
||||||
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
qingfengfenga <41416092+qingfengfenga@users.noreply.github.com>
|
||||||
|
@ -614,16 +731,19 @@ rhuddleston <ryan.huddleston@percona.com>
|
||||||
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
rimoliga <53384203+rimoliga@users.noreply.github.com>
|
||||||
runfuture <runfuture@users.noreply.github.com>
|
runfuture <runfuture@users.noreply.github.com>
|
||||||
sandyiscool <sandyiscool@gmail.com>
|
sandyiscool <sandyiscool@gmail.com>
|
||||||
|
sasha0552 <admin@sasha0552.org>
|
||||||
semidark <me@semidark.net>
|
semidark <me@semidark.net>
|
||||||
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
sharpHL <132747147+sharpHL@users.noreply.github.com>
|
||||||
shibe2 <shibe@tuta.io>
|
shibe2 <shibe@tuta.io>
|
||||||
singularity <12184989+singularity-s0@users.noreply.github.com>
|
singularity <12184989+singularity-s0@users.noreply.github.com>
|
||||||
sjinzh <sjinzh@gmail.com>
|
sjinzh <sjinzh@gmail.com>
|
||||||
|
sjxx <63994076+ylsdamxssjxxdd@users.noreply.github.com>
|
||||||
slaren <2141330+slaren@users.noreply.github.com>
|
slaren <2141330+slaren@users.noreply.github.com>
|
||||||
slaren <slarengh@gmail.com>
|
slaren <slarengh@gmail.com>
|
||||||
snadampal <87143774+snadampal@users.noreply.github.com>
|
snadampal <87143774+snadampal@users.noreply.github.com>
|
||||||
staviq <staviq@gmail.com>
|
staviq <staviq@gmail.com>
|
||||||
stduhpf <stephduh@live.fr>
|
stduhpf <stephduh@live.fr>
|
||||||
|
strawberrymelonpanda <152940198+strawberrymelonpanda@users.noreply.github.com>
|
||||||
swittk <switt1995@gmail.com>
|
swittk <switt1995@gmail.com>
|
||||||
takov751 <40316768+takov751@users.noreply.github.com>
|
takov751 <40316768+takov751@users.noreply.github.com>
|
||||||
tarcey <cey.tarik@gmail.com>
|
tarcey <cey.tarik@gmail.com>
|
||||||
|
@ -636,12 +756,16 @@ uint256_t <konndennsa@gmail.com>
|
||||||
uint256_t <maekawatoshiki1017@gmail.com>
|
uint256_t <maekawatoshiki1017@gmail.com>
|
||||||
unbounded <haakon@likedan.net>
|
unbounded <haakon@likedan.net>
|
||||||
valiray <133289098+valiray@users.noreply.github.com>
|
valiray <133289098+valiray@users.noreply.github.com>
|
||||||
|
vik <vikhyatk@gmail.com>
|
||||||
|
viric <viric@viric.name>
|
||||||
vodkaslime <646329483@qq.com>
|
vodkaslime <646329483@qq.com>
|
||||||
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
vvhg1 <94630311+vvhg1@users.noreply.github.com>
|
||||||
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
vxiiduu <73044267+vxiiduu@users.noreply.github.com>
|
||||||
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
wbpxre150 <100937007+wbpxre150@users.noreply.github.com>
|
||||||
whoreson <139810751+whoreson@users.noreply.github.com>
|
whoreson <139810751+whoreson@users.noreply.github.com>
|
||||||
|
woachk <24752637+woachk@users.noreply.github.com>
|
||||||
wonjun Jang <strutive07@gmail.com>
|
wonjun Jang <strutive07@gmail.com>
|
||||||
|
woodx <124784234+woodx9@users.noreply.github.com>
|
||||||
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
wzy <32936898+Freed-Wu@users.noreply.github.com>
|
||||||
xaedes <xaedes@gmail.com>
|
xaedes <xaedes@gmail.com>
|
||||||
xaedes <xaedes@googlemail.com>
|
xaedes <xaedes@googlemail.com>
|
||||||
|
@ -649,7 +773,10 @@ xloem <0xloem@gmail.com>
|
||||||
yangli2 <yangli2@gmail.com>
|
yangli2 <yangli2@gmail.com>
|
||||||
yuiseki <yuiseki@gmail.com>
|
yuiseki <yuiseki@gmail.com>
|
||||||
zakkor <edward.partenie@gmail.com>
|
zakkor <edward.partenie@gmail.com>
|
||||||
|
zhangkaihuo <zhangkaihuo@gmail.com>
|
||||||
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
zhouwg <6889919+zhouwg@users.noreply.github.com>
|
||||||
|
zhouwg <zhouwg2000@gmail.com>
|
||||||
zrm <trustiosity.zrm@gmail.com>
|
zrm <trustiosity.zrm@gmail.com>
|
||||||
|
Ștefan-Gabriel Muscalu <legraphista@users.noreply.github.com>
|
||||||
源文雨 <41315874+fumiama@users.noreply.github.com>
|
源文雨 <41315874+fumiama@users.noreply.github.com>
|
||||||
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com>
|
||||||
|
|
1368
CMakeLists.txt
1368
CMakeLists.txt
File diff suppressed because it is too large
Load diff
|
@ -11,10 +11,22 @@
|
||||||
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "sycl-base",
|
||||||
|
"hidden": true,
|
||||||
|
"generator": "Ninja",
|
||||||
|
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||||
|
"CMAKE_CXX_COMPILER": "icx",
|
||||||
|
"GGML_SYCL": "ON",
|
||||||
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
|
}
|
||||||
|
},
|
||||||
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||||
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
{ "name": "static", "hidden": true, "cacheVariables": { "LLAMA_STATIC": "ON" } },
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
|
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||||
|
|
||||||
{
|
{
|
||||||
"name": "arm64-windows-msvc", "hidden": true,
|
"name": "arm64-windows-msvc", "hidden": true,
|
||||||
|
@ -35,15 +47,18 @@
|
||||||
},
|
},
|
||||||
|
|
||||||
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
{ "name": "arm64-windows-llvm-debug" , "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||||
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "release" ] },
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "release", "static" ] },
|
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
{ "name": "arm64-windows-msvc-debug" , "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||||
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "release" ] },
|
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||||
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "release", "static" ] },
|
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||||
|
|
||||||
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
{ "name": "x64-windows-msvc-debug" , "inherits": [ "base", "debug" ] },
|
||||||
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "release" ] },
|
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||||
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "release", "static" ] }
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-sycl-debug" , "inherits": [ "sycl-base", "debug" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,14 +3,13 @@
|
||||||
import PackageDescription
|
import PackageDescription
|
||||||
|
|
||||||
var sources = [
|
var sources = [
|
||||||
"ggml.c",
|
"src/llama.cpp",
|
||||||
"sgemm.cpp",
|
"src/unicode.cpp",
|
||||||
"llama.cpp",
|
"src/unicode-data.cpp",
|
||||||
"unicode.cpp",
|
"ggml/src/ggml.c",
|
||||||
"unicode-data.cpp",
|
"ggml/src/ggml-alloc.c",
|
||||||
"ggml-alloc.c",
|
"ggml/src/ggml-backend.c",
|
||||||
"ggml-backend.c",
|
"ggml/src/ggml-quants.c",
|
||||||
"ggml-quants.c",
|
|
||||||
]
|
]
|
||||||
|
|
||||||
var resources: [Resource] = []
|
var resources: [Resource] = []
|
||||||
|
@ -26,8 +25,8 @@ var cSettings: [CSetting] = [
|
||||||
]
|
]
|
||||||
|
|
||||||
#if canImport(Darwin)
|
#if canImport(Darwin)
|
||||||
sources.append("ggml-metal.m")
|
sources.append("ggml/src/ggml-metal.m")
|
||||||
resources.append(.process("ggml-metal.metal"))
|
resources.append(.process("ggml/src/ggml-metal.metal"))
|
||||||
linkerSettings.append(.linkedFramework("Accelerate"))
|
linkerSettings.append(.linkedFramework("Accelerate"))
|
||||||
cSettings.append(
|
cSettings.append(
|
||||||
contentsOf: [
|
contentsOf: [
|
||||||
|
@ -63,8 +62,6 @@ let package = Package(
|
||||||
"models",
|
"models",
|
||||||
"tests",
|
"tests",
|
||||||
"CMakeLists.txt",
|
"CMakeLists.txt",
|
||||||
"ggml-cuda.cu",
|
|
||||||
"ggml-cuda.h",
|
|
||||||
"Makefile"
|
"Makefile"
|
||||||
],
|
],
|
||||||
sources: sources,
|
sources: sources,
|
||||||
|
|
|
@ -115,12 +115,12 @@ The docker build option is currently limited to *intel GPU* targets.
|
||||||
### Build image
|
### Build image
|
||||||
```sh
|
```sh
|
||||||
# Using FP16
|
# Using FP16
|
||||||
docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
|
||||||
```
|
```
|
||||||
|
|
||||||
*Notes*:
|
*Notes*:
|
||||||
|
|
||||||
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command.
|
To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="GGML_SYCL_F16=ON"` argument from the previous command.
|
||||||
|
|
||||||
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
You can also use the `.devops/llama-server-intel.Dockerfile`, which builds the *"server"* alternative.
|
||||||
|
|
||||||
|
@ -244,10 +244,10 @@ source /opt/intel/oneapi/setvars.sh
|
||||||
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
# Build LLAMA with MKL BLAS acceleration for intel GPU
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
# Option 2: Use FP16
|
# Option 2: Use FP16
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
cmake -B build -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
# build all binary
|
# build all binary
|
||||||
cmake --build build --config Release -j -v
|
cmake --build build --config Release -j -v
|
||||||
|
@ -264,10 +264,10 @@ export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR
|
||||||
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
# Build LLAMA with Nvidia BLAS acceleration through SYCL
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
# Option 2: Use FP16
|
# Option 2: Use FP16
|
||||||
cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
|
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
# build all binary
|
# build all binary
|
||||||
cmake --build build --config Release -j -v
|
cmake --build build --config Release -j -v
|
||||||
|
@ -410,15 +410,9 @@ Output (example):
|
||||||
|
|
||||||
4. Install build tools
|
4. Install build tools
|
||||||
|
|
||||||
a. Download & install cmake for Windows: https://cmake.org/download/
|
a. Download & install cmake for Windows: https://cmake.org/download/ (CMake can also be installed from Visual Studio Installer)
|
||||||
|
b. The new Visual Studio will install Ninja as default. (If not, please install it manually: https://ninja-build.org/)
|
||||||
|
|
||||||
b. Download & install mingw-w64 make for Windows provided by w64devkit
|
|
||||||
|
|
||||||
- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip).
|
|
||||||
|
|
||||||
- Extract `w64devkit` on your pc.
|
|
||||||
|
|
||||||
- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`).
|
|
||||||
|
|
||||||
### II. Build llama.cpp
|
### II. Build llama.cpp
|
||||||
|
|
||||||
|
@ -428,10 +422,10 @@ On the oneAPI command line window, step into the llama.cpp main directory and ru
|
||||||
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force
|
||||||
|
|
||||||
# Option 1: Use FP32 (recommended for better performance in most cases)
|
# Option 1: Use FP32 (recommended for better performance in most cases)
|
||||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release
|
||||||
|
|
||||||
# Option 2: Or FP16
|
# Option 2: Or FP16
|
||||||
cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
cmake -B build -G "Ninja" -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
cmake --build build --config Release -j
|
cmake --build build --config Release -j
|
||||||
```
|
```
|
||||||
|
@ -441,9 +435,23 @@ Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former in
|
||||||
.\examples\sycl\win-build-sycl.bat
|
.\examples\sycl\win-build-sycl.bat
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Or, use CMake presets to build:
|
||||||
|
```sh
|
||||||
|
cmake --preset x64-windows-sycl-release
|
||||||
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
|
cmake -DGGML_SYCL_F16=ON --preset x64-windows-sycl-release
|
||||||
|
cmake --build build-x64-windows-sycl-release -j --target llama-cli
|
||||||
|
|
||||||
|
cmake --preset x64-windows-sycl-debug
|
||||||
|
cmake --build build-x64-windows-sycl-debug -j --target llama-cli
|
||||||
|
```
|
||||||
|
|
||||||
|
Or, you can use Visual Studio to open llama.cpp folder as a CMake project. Choose the sycl CMake presets (`x64-windows-sycl-release` or `x64-windows-sycl-debug`) before you compile the project.
|
||||||
|
|
||||||
*Notes:*
|
*Notes:*
|
||||||
|
|
||||||
- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make llama-cli`.
|
- In case of a minimal experimental setup, the user can build the inference executable only through `cmake --build build --config Release -j --target llama-cli`.
|
||||||
|
|
||||||
### III. Run the inference
|
### III. Run the inference
|
||||||
|
|
||||||
|
@ -536,9 +544,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
||||||
|
|
||||||
| Name | Value | Function |
|
| Name | Value | Function |
|
||||||
|--------------------|-----------------------------------|---------------------------------------------|
|
|--------------------|-----------------------------------|---------------------------------------------|
|
||||||
| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path. |
|
||||||
| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. |
|
||||||
| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||||
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
|
| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. |
|
||||||
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||||
|
|
||||||
|
|
62
README.md
62
README.md
|
@ -15,6 +15,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
|
||||||
|
|
||||||
### Recent API changes
|
### Recent API changes
|
||||||
|
|
||||||
|
- [2024 Jun 26] The source code and CMake build scripts have been restructured https://github.com/ggerganov/llama.cpp/pull/8006
|
||||||
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807
|
||||||
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
|
- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341
|
||||||
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
|
- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122
|
||||||
|
@ -415,7 +416,7 @@ Flox follows the nixpkgs build of llama.cpp.
|
||||||
### Metal Build
|
### Metal Build
|
||||||
|
|
||||||
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
On MacOS, Metal is enabled by default. Using Metal makes the computation run on the GPU.
|
||||||
To disable the Metal build at compile time use the `LLAMA_NO_METAL=1` flag or the `LLAMA_METAL=OFF` cmake option.
|
To disable the Metal build at compile time use the `GGML_NO_METAL=1` flag or the `GGML_METAL=OFF` cmake option.
|
||||||
|
|
||||||
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
When built with Metal support, you can explicitly disable GPU inference with the `--n-gpu-layers|-ngl 0` command-line
|
||||||
argument.
|
argument.
|
||||||
|
@ -435,7 +436,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
- On Linux:
|
- On Linux:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_OPENBLAS=1
|
make GGML_OPENBLAS=1
|
||||||
```
|
```
|
||||||
|
|
||||||
- On Windows:
|
- On Windows:
|
||||||
|
@ -450,13 +451,13 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
8. From here you can run:
|
8. From here you can run:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_OPENBLAS=1
|
make GGML_OPENBLAS=1
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake` on Linux:
|
- Using `CMake` on Linux:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -475,10 +476,10 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni. Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
|
||||||
|
|
||||||
- Using manual oneAPI installation:
|
- Using manual oneAPI installation:
|
||||||
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
By default, `GGML_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DGGML_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps:
|
||||||
```bash
|
```bash
|
||||||
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
|
||||||
cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
|
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_NATIVE=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -495,27 +496,28 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_CUDA=1
|
make GGML_CUDA=1
|
||||||
```
|
```
|
||||||
- Using `CMake`:
|
- Using `CMake`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_CUDA=ON
|
cmake -B build -DGGML_CUDA=ON
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance:
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|-------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
| GGML_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. |
|
||||||
| LLAMA_CUDA_FORCE_MMQ | Boolean | false | Force the use of dequantization + matrix multiplication kernels instead of leveraging Math libraries. | |
|
| GGML_CUDA_FORCE_MMQ | Boolean | false | Force the use of custom matrix multiplication kernels for quantized models instead of FP16 cuBLAS even if there is no int8 tensor core implementation available (affects V100, RDNA3). MMQ kernels are enabled by default on GPUs with int8 tensor core support. With MMQ force enabled, speed for large batch sizes will be worse but VRAM consumption will be lower. |
|
||||||
| LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
| GGML_CUDA_FORCE_CUBLAS | Boolean | false | Force the use of FP16 cuBLAS instead of custom matrix multiplication kernels for quantized models |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| GGML_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. |
|
||||||
| LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
| LLAMA_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
| GGML_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. |
|
||||||
|
| GGML_CUDA_FA_ALL_QUANTS | Boolean | false | Compile support for all KV cache quantization type (combinations) for the FlashAttention CUDA kernels. More fine-grained control over KV cache size but compilation takes much longer. |
|
||||||
|
|
||||||
- #### hipBLAS
|
- #### hipBLAS
|
||||||
|
|
||||||
|
@ -525,15 +527,15 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
|
|
||||||
- Using `make`:
|
- Using `make`:
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_HIPBLAS=1
|
make GGML_HIPBLAS=1
|
||||||
```
|
```
|
||||||
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
- Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
|
||||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build --config Release -- -j 16
|
&& cmake --build build --config Release -- -j 16
|
||||||
```
|
```
|
||||||
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON`.
|
On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
|
||||||
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
|
||||||
|
|
||||||
Note that if you get the following error:
|
Note that if you get the following error:
|
||||||
|
@ -547,19 +549,19 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
```bash
|
```bash
|
||||||
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -p)" \
|
||||||
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
HIP_DEVICE_LIB_PATH=<directory-you-just-found> \
|
||||||
cmake -S . -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
cmake -S . -B build -DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
|
||||||
&& cmake --build build -- -j 16
|
&& cmake --build build -- -j 16
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
- Using `make` (example for target gfx1030, build with 16 CPU threads):
|
||||||
```bash
|
```bash
|
||||||
make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
make -j16 GGML_HIPBLAS=1 GGML_HIP_UMA=1 AMDGPU_TARGETS=gfx1030
|
||||||
```
|
```
|
||||||
|
|
||||||
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
- Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU):
|
||||||
```bash
|
```bash
|
||||||
set PATH=%HIP_PATH%\bin;%PATH%
|
set PATH=%HIP_PATH%\bin;%PATH%
|
||||||
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
cmake -S . -B build -G Ninja -DAMDGPU_TARGETS=gfx1100 -DGGML_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release
|
||||||
cmake --build build
|
cmake --build build
|
||||||
```
|
```
|
||||||
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors)
|
||||||
|
@ -570,11 +572,11 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
|
||||||
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above):
|
||||||
|
|
||||||
| Option | Legal values | Default | Description |
|
| Option | Legal values | Default | Description |
|
||||||
|-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
| GGML_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
| GGML_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. |
|
||||||
| LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
| GGML_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. |
|
||||||
|
|
||||||
- #### Vulkan
|
- #### Vulkan
|
||||||
|
|
||||||
|
@ -612,7 +614,7 @@ Building the program with BLAS support may lead to some performance improvements
|
||||||
Then, build llama.cpp using the cmake command below:
|
Then, build llama.cpp using the cmake command below:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
cmake -B build -DLLAMA_VULKAN=1
|
cmake -B build -DGGML_VULKAN=1
|
||||||
cmake --build build --config Release
|
cmake --build build --config Release
|
||||||
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
# Test the output binary (with "-ngl 33" to offload all layers to GPU)
|
||||||
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
./bin/llama-cli -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
|
||||||
|
|
10
ci/run.sh
10
ci/run.sh
|
@ -36,11 +36,11 @@ SRC=`pwd`
|
||||||
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=1"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
|
@ -50,7 +50,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||||
fi
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
|
@ -284,7 +284,7 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../examples/convert-legacy-llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
@ -550,7 +550,7 @@ function gg_run_pythia_2_8b {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DGGML_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
python3 ../convert-hf-to-gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
22
cmake/git-vars.cmake
Normal file
22
cmake/git-vars.cmake
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
find_package(Git)
|
||||||
|
|
||||||
|
# the commit's SHA1
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_SHA1
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
# the date of the commit
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_DATE
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
||||||
|
|
||||||
|
# the subject of the commit
|
||||||
|
execute_process(COMMAND
|
||||||
|
"${GIT_EXECUTABLE}" log -1 --format=%s
|
||||||
|
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
|
||||||
|
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
|
||||||
|
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
|
|
@ -1,41 +1,43 @@
|
||||||
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
|
||||||
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
|
||||||
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
|
||||||
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
|
||||||
set(LLAMA_BLAS @LLAMA_BLAS@)
|
|
||||||
set(LLAMA_CUDA @LLAMA_CUDA@)
|
set(GGML_BLAS @GGML_BLAS@)
|
||||||
set(LLAMA_METAL @LLAMA_METAL@)
|
set(GGML_CUDA @GGML_CUDA@)
|
||||||
set(LLAMA_HIPBLAS @LLAMA_HIPBLAS@)
|
set(GGML_METAL @GGML_METAL@)
|
||||||
set(LLAMA_ACCELERATE @LLAMA_ACCELERATE@)
|
set(GGML_HIPBLAS @GGML_HIPBLAS@)
|
||||||
|
set(GGML_ACCELERATE @GGML_ACCELERATE@)
|
||||||
|
|
||||||
@PACKAGE_INIT@
|
@PACKAGE_INIT@
|
||||||
|
|
||||||
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
|
||||||
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
|
||||||
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
|
||||||
|
|
||||||
# Ensure transient dependencies satisfied
|
# Ensure transient dependencies satisfied
|
||||||
|
|
||||||
find_package(Threads REQUIRED)
|
find_package(Threads REQUIRED)
|
||||||
if (APPLE AND LLAMA_ACCELERATE)
|
|
||||||
|
if (APPLE AND GGML_ACCELERATE)
|
||||||
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_BLAS)
|
if (GGML_BLAS)
|
||||||
find_package(BLAS REQUIRED)
|
find_package(BLAS REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_CUDA)
|
if (GGML_CUDA)
|
||||||
find_package(CUDAToolkit REQUIRED)
|
find_package(CUDAToolkit REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_METAL)
|
if (GGML_METAL)
|
||||||
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
|
||||||
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
find_library(METAL_FRAMEWORK Metal REQUIRED)
|
||||||
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (LLAMA_HIPBLAS)
|
if (GGML_HIPBLAS)
|
||||||
find_package(hip REQUIRED)
|
find_package(hip REQUIRED)
|
||||||
find_package(hipblas REQUIRED)
|
find_package(hipblas REQUIRED)
|
||||||
find_package(rocblas REQUIRED)
|
find_package(rocblas REQUIRED)
|
||||||
|
@ -47,7 +49,9 @@ find_library(llama_LIBRARY llama
|
||||||
|
|
||||||
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
|
||||||
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
|
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
|
||||||
|
|
||||||
add_library(llama UNKNOWN IMPORTED)
|
add_library(llama UNKNOWN IMPORTED)
|
||||||
|
|
||||||
set_target_properties(llama
|
set_target_properties(llama
|
||||||
PROPERTIES
|
PROPERTIES
|
||||||
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
|
|
@ -1,5 +1,6 @@
|
||||||
# common
|
# common
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
# Build info header
|
# Build info header
|
||||||
#
|
#
|
||||||
|
@ -36,7 +37,7 @@ add_custom_command(
|
||||||
COMMENT "Generating build details from Git"
|
COMMENT "Generating build details from Git"
|
||||||
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
COMMAND ${CMAKE_COMMAND} -DMSVC=${MSVC} -DCMAKE_C_COMPILER_VERSION=${CMAKE_C_COMPILER_VERSION}
|
||||||
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
-DCMAKE_C_COMPILER_ID=${CMAKE_C_COMPILER_ID} -DCMAKE_VS_PLATFORM_NAME=${CMAKE_VS_PLATFORM_NAME}
|
||||||
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/../scripts/gen-build-info-cpp.cmake"
|
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info-gen-cpp.cmake"
|
||||||
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/.."
|
||||||
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/build-info.cpp.in" ${GIT_INDEX}
|
||||||
VERBATIM
|
VERBATIM
|
||||||
|
@ -83,5 +84,5 @@ if (LLAMA_CURL)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
target_include_directories(${TARGET} PUBLIC .)
|
target_include_directories(${TARGET} PUBLIC .)
|
||||||
target_compile_features(${TARGET} PUBLIC cxx_std_11)
|
target_compile_features (${TARGET} PUBLIC cxx_std_11)
|
||||||
target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
target_link_libraries (${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama Threads::Threads)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
|
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
|
||||||
|
|
||||||
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
set(TEMPLATE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp.in")
|
||||||
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
set(OUTPUT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/common/build-info.cpp")
|
||||||
|
|
||||||
# Only write the build info if it changed
|
# Only write the build info if it changed
|
||||||
if(EXISTS ${OUTPUT_FILE})
|
if(EXISTS ${OUTPUT_FILE})
|
File diff suppressed because it is too large
Load diff
|
@ -52,6 +52,12 @@ int32_t cpu_get_num_math();
|
||||||
// CLI argument parsing
|
// CLI argument parsing
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// dimensionality reduction methods, used by cvector-generator
|
||||||
|
enum dimre_method {
|
||||||
|
DIMRE_METHOD_PCA,
|
||||||
|
DIMRE_METHOD_MEAN,
|
||||||
|
};
|
||||||
|
|
||||||
struct gpt_params {
|
struct gpt_params {
|
||||||
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed
|
||||||
|
|
||||||
|
@ -152,7 +158,6 @@ struct gpt_params {
|
||||||
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
bool prompt_cache_all = false; // save user input and generations to prompt cache
|
||||||
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it
|
||||||
|
|
||||||
bool embedding = false; // get only sentence embedding
|
|
||||||
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
bool escape = true; // escape "\n", "\r", "\t", "\'", "\"", and "\\"
|
||||||
bool multiline_input = false; // reverse the usage of `\`
|
bool multiline_input = false; // reverse the usage of `\`
|
||||||
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
bool simple_io = false; // improves compatibility with subprocesses and limited consoles
|
||||||
|
@ -179,6 +184,12 @@ struct gpt_params {
|
||||||
std::string mmproj = ""; // path to multimodal projector
|
std::string mmproj = ""; // path to multimodal projector
|
||||||
std::vector<std::string> image; // path to image file(s)
|
std::vector<std::string> image; // path to image file(s)
|
||||||
|
|
||||||
|
// embedding
|
||||||
|
bool embedding = false; // get only sentence embedding
|
||||||
|
int32_t embd_normalize = 2; // normalisation for embendings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
|
||||||
|
std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
|
||||||
|
std::string embd_sep = "\n"; // separator of embendings
|
||||||
|
|
||||||
// server params
|
// server params
|
||||||
int32_t port = 8080; // server listens on this network port
|
int32_t port = 8080; // server listens on this network port
|
||||||
int32_t timeout_read = 600; // http read timeout in seconds
|
int32_t timeout_read = 600; // http read timeout in seconds
|
||||||
|
@ -233,13 +244,12 @@ struct gpt_params {
|
||||||
bool compute_ppl = true; // whether to compute perplexity
|
bool compute_ppl = true; // whether to compute perplexity
|
||||||
|
|
||||||
// cvector-generator params
|
// cvector-generator params
|
||||||
int n_completions = 64;
|
int n_pca_batch = 100;
|
||||||
int n_pca_batch = 20;
|
|
||||||
int n_pca_iterations = 1000;
|
int n_pca_iterations = 1000;
|
||||||
std::string cvector_outfile = "control_vector.gguf";
|
dimre_method cvector_dimre_method = DIMRE_METHOD_PCA;
|
||||||
std::string cvector_completions_file = "examples/cvector-generator/completions.txt";
|
std::string cvector_outfile = "control_vector.gguf";
|
||||||
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
std::string cvector_positive_file = "examples/cvector-generator/positive.txt";
|
||||||
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
std::string cvector_negative_file = "examples/cvector-generator/negative.txt";
|
||||||
|
|
||||||
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
bool spm_infill = false; // suffix/prefix/middle pattern for infill
|
||||||
};
|
};
|
||||||
|
@ -362,9 +372,32 @@ bool llama_should_add_bos_token(const llama_model * model);
|
||||||
// Chat template utils
|
// Chat template utils
|
||||||
//
|
//
|
||||||
|
|
||||||
|
// same with llama_chat_message, but uses std::string
|
||||||
|
struct llama_chat_msg {
|
||||||
|
std::string role;
|
||||||
|
std::string content;
|
||||||
|
};
|
||||||
|
|
||||||
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
// Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
|
||||||
bool llama_chat_verify_template(const std::string & tmpl);
|
bool llama_chat_verify_template(const std::string & tmpl);
|
||||||
|
|
||||||
|
// CPP wrapper for llama_chat_apply_template
|
||||||
|
std::string llama_chat_apply_template(const struct llama_model * model,
|
||||||
|
const std::string & tmpl,
|
||||||
|
const std::vector<llama_chat_msg> & chat,
|
||||||
|
bool add_ass);
|
||||||
|
|
||||||
|
// Format single message, while taking into account the position of that message in chat history
|
||||||
|
std::string llama_chat_format_single(const struct llama_model * model,
|
||||||
|
const std::string & tmpl,
|
||||||
|
const std::vector<llama_chat_msg> & past_msg,
|
||||||
|
const llama_chat_msg & new_msg,
|
||||||
|
bool add_ass);
|
||||||
|
|
||||||
|
// Returns an example of formatted chat
|
||||||
|
std::string llama_chat_format_example(const struct llama_model * model,
|
||||||
|
const std::string & tmpl);
|
||||||
|
|
||||||
//
|
//
|
||||||
// KV cache utils
|
// KV cache utils
|
||||||
//
|
//
|
||||||
|
@ -379,7 +412,7 @@ void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_siz
|
||||||
// Embedding utils
|
// Embedding utils
|
||||||
//
|
//
|
||||||
|
|
||||||
void llama_embd_normalize(const float * inp, float * out, int n);
|
void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
|
||||||
|
|
||||||
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
|
||||||
|
|
||||||
|
|
|
@ -40,6 +40,233 @@ static std::string build_repetition(const std::string & item_rule, int min_items
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Minimalistic replacement for std::string_view, which is only available from C++17 onwards */
|
||||||
|
class string_view {
|
||||||
|
const std::string & _str;
|
||||||
|
const size_t _start;
|
||||||
|
const size_t _end;
|
||||||
|
public:
|
||||||
|
string_view(const std::string & str, size_t start = 0, size_t end = std::string::npos) : _str(str), _start(start), _end(end == std::string::npos ? str.length() : end) {}
|
||||||
|
|
||||||
|
size_t size() const {
|
||||||
|
return _end - _start;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t length() const {
|
||||||
|
return size();
|
||||||
|
}
|
||||||
|
|
||||||
|
operator std::string() const {
|
||||||
|
return str();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string str() const {
|
||||||
|
return _str.substr(_start, _end - _start);
|
||||||
|
}
|
||||||
|
|
||||||
|
string_view substr(size_t pos, size_t len = std::string::npos) const {
|
||||||
|
return string_view(_str, _start + pos, len == std::string::npos ? _end : _start + pos + len);
|
||||||
|
}
|
||||||
|
|
||||||
|
char operator[](size_t pos) const {
|
||||||
|
auto index = _start + pos;
|
||||||
|
if (index >= _end) {
|
||||||
|
throw std::out_of_range("string_view index out of range");
|
||||||
|
}
|
||||||
|
return _str[_start + pos];
|
||||||
|
}
|
||||||
|
|
||||||
|
bool operator==(const string_view & other) const {
|
||||||
|
std::string this_str = *this;
|
||||||
|
std::string other_str = other;
|
||||||
|
return this_str == other_str;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void _build_min_max_int(int min_value, int max_value, std::stringstream & out, int decimals_left = 16, bool top_level = true) {
|
||||||
|
auto has_min = min_value != std::numeric_limits<int>::min();
|
||||||
|
auto has_max = max_value != std::numeric_limits<int>::max();
|
||||||
|
|
||||||
|
auto digit_range = [&](char from, char to) {
|
||||||
|
out << "[";
|
||||||
|
if (from == to) {
|
||||||
|
out << from;
|
||||||
|
} else {
|
||||||
|
out << from << "-" << to;
|
||||||
|
}
|
||||||
|
out << "]";
|
||||||
|
};
|
||||||
|
auto more_digits = [&](int min_digits, int max_digits) {
|
||||||
|
out << "[0-9]";
|
||||||
|
if (min_digits == max_digits && min_digits == 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
out << "{";
|
||||||
|
out << min_digits;
|
||||||
|
if (max_digits != min_digits) {
|
||||||
|
out << ",";
|
||||||
|
if (max_digits != std::numeric_limits<int>::max()) {
|
||||||
|
out << max_digits;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out << "}";
|
||||||
|
};
|
||||||
|
std::function<void(const string_view &, const string_view &)> uniform_range =
|
||||||
|
[&](const string_view & from, const string_view & to) {
|
||||||
|
size_t i = 0;
|
||||||
|
while (i < from.length() && i < to.length() && from[i] == to[i]) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (i > 0) {
|
||||||
|
out << "\"" << from.substr(0, i).str() << "\"";
|
||||||
|
}
|
||||||
|
if (i < from.length() && i < to.length()) {
|
||||||
|
if (i > 0) {
|
||||||
|
out << " ";
|
||||||
|
}
|
||||||
|
auto sub_len = from.length() - i - 1;
|
||||||
|
if (sub_len > 0) {
|
||||||
|
auto from_sub = from.substr(i + 1);
|
||||||
|
auto to_sub = to.substr(i + 1);
|
||||||
|
auto sub_zeros = repeat("0", sub_len);
|
||||||
|
auto sub_nines = repeat("9", sub_len);
|
||||||
|
|
||||||
|
auto to_reached = false;
|
||||||
|
out << "(";
|
||||||
|
if (from_sub == sub_zeros) {
|
||||||
|
digit_range(from[i], to[i] - 1);
|
||||||
|
out << " ";
|
||||||
|
more_digits(sub_len, sub_len);
|
||||||
|
} else {
|
||||||
|
out << "[" << from[i] << "] ";
|
||||||
|
out << "(";
|
||||||
|
uniform_range(from_sub, sub_nines);
|
||||||
|
out << ")";
|
||||||
|
if (from[i] < to[i] - 1) {
|
||||||
|
out << " | ";
|
||||||
|
if (to_sub == sub_nines) {
|
||||||
|
digit_range(from[i] + 1, to[i]);
|
||||||
|
to_reached = true;
|
||||||
|
} else {
|
||||||
|
digit_range(from[i] + 1, to[i] - 1);
|
||||||
|
}
|
||||||
|
out << " ";
|
||||||
|
more_digits(sub_len, sub_len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!to_reached) {
|
||||||
|
out << " | ";
|
||||||
|
digit_range(to[i], to[i]);
|
||||||
|
out << " ";
|
||||||
|
uniform_range(sub_zeros, to_sub);
|
||||||
|
}
|
||||||
|
out << ")";
|
||||||
|
} else {
|
||||||
|
out << "[" << from[i] << "-" << to[i] << "]";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (has_min && has_max) {
|
||||||
|
if (min_value < 0 && max_value < 0) {
|
||||||
|
out << "\"-\" (";
|
||||||
|
_build_min_max_int(-max_value, -min_value, out, decimals_left, /* top_level= */ true);
|
||||||
|
out << ")";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (min_value < 0) {
|
||||||
|
out << "\"-\" (";
|
||||||
|
_build_min_max_int(0, -min_value, out, decimals_left, /* top_level= */ true);
|
||||||
|
out << ") | ";
|
||||||
|
min_value = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto min_s = std::to_string(min_value);
|
||||||
|
auto max_s = std::to_string(max_value);
|
||||||
|
auto min_digits = min_s.length();
|
||||||
|
auto max_digits = max_s.length();
|
||||||
|
|
||||||
|
for (auto digits = min_digits; digits < max_digits; digits++) {
|
||||||
|
uniform_range(min_s, repeat("9", digits));
|
||||||
|
min_s = "1" + repeat("0", digits);
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
uniform_range(min_s, max_s);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto less_decimals = std::max(decimals_left - 1, 1);
|
||||||
|
|
||||||
|
if (has_min) {
|
||||||
|
if (min_value < 0) {
|
||||||
|
out << "\"-\" (";
|
||||||
|
_build_min_max_int(std::numeric_limits<int>::min(), -min_value, out, decimals_left, /* top_level= */ false);
|
||||||
|
out << ") | [0] | [1-9] ";
|
||||||
|
more_digits(0, decimals_left - 1);
|
||||||
|
} else if (min_value == 0) {
|
||||||
|
if (top_level) {
|
||||||
|
out << "[0] | [1-9] ";
|
||||||
|
more_digits(0, less_decimals);
|
||||||
|
} else {
|
||||||
|
more_digits(1, decimals_left);
|
||||||
|
}
|
||||||
|
} else if (min_value <= 9) {
|
||||||
|
char c = '0' + min_value;
|
||||||
|
auto range_start = top_level ? '1' : '0';
|
||||||
|
if (c > range_start) {
|
||||||
|
digit_range(range_start, c - 1);
|
||||||
|
out << " ";
|
||||||
|
more_digits(1, less_decimals);
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
digit_range(c, '9');
|
||||||
|
out << " ";
|
||||||
|
more_digits(0, less_decimals);
|
||||||
|
} else {
|
||||||
|
auto min_s = std::to_string(min_value);
|
||||||
|
auto len = min_s.length();
|
||||||
|
auto c = min_s[0];
|
||||||
|
|
||||||
|
if (c > '1') {
|
||||||
|
digit_range(top_level ? '1' : '0', c - 1);
|
||||||
|
out << " ";
|
||||||
|
more_digits(len, less_decimals);
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
digit_range(c, c);
|
||||||
|
out << " (";
|
||||||
|
_build_min_max_int(std::stoi(min_s.substr(1)), std::numeric_limits<int>::max(), out, less_decimals, /* top_level= */ false);
|
||||||
|
out << ")";
|
||||||
|
if (c < '9') {
|
||||||
|
out << " | ";
|
||||||
|
digit_range(c + 1, '9');
|
||||||
|
out << " ";
|
||||||
|
more_digits(len - 1, less_decimals);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_max) {
|
||||||
|
if (max_value >= 0) {
|
||||||
|
if (top_level) {
|
||||||
|
out << "\"-\" [1-9] ";
|
||||||
|
more_digits(0, less_decimals);
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
_build_min_max_int(0, max_value, out, decimals_left, /* top_level= */ true);
|
||||||
|
} else {
|
||||||
|
out << "\"-\" (";
|
||||||
|
_build_min_max_int(-max_value, std::numeric_limits<int>::max(), out, decimals_left, /* top_level= */ false);
|
||||||
|
out << ")";
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw std::runtime_error("At least one of min_value or max_value must be set");
|
||||||
|
}
|
||||||
|
|
||||||
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
const std::string SPACE_RULE = "| \" \" | \"\\n\" [ \\t]{0,20}";
|
||||||
|
|
||||||
struct BuiltinRule {
|
struct BuiltinRule {
|
||||||
|
@ -160,7 +387,6 @@ static std::string format_literal(const std::string & literal) {
|
||||||
return "\"" + escaped + "\"";
|
return "\"" + escaped + "\"";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class SchemaConverter {
|
class SchemaConverter {
|
||||||
private:
|
private:
|
||||||
std::function<json(const std::string &)> _fetch_json;
|
std::function<json(const std::string &)> _fetch_json;
|
||||||
|
@ -388,6 +614,75 @@ private:
|
||||||
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
return _add_rule(name, "\"\\\"\" " + to_rule(transform()) + " \"\\\"\" space");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Returns a rule that matches a JSON string that is none of the provided strings
|
||||||
|
|
||||||
|
not_strings({"a"})
|
||||||
|
-> ["] ( [a] char+ | [^"a] char* )? ["] space
|
||||||
|
not_strings({"and", "also"})
|
||||||
|
-> ["] ( [a] ([l] ([s] ([o] char+ | [^"o] char*) | [^"s] char*) | [n] ([d] char+ | [^"d] char*) | [^"ln] char*) | [^"a] char* )? ["] space
|
||||||
|
*/
|
||||||
|
std::string _not_strings(const std::vector<std::string> & strings) {
|
||||||
|
|
||||||
|
struct TrieNode {
|
||||||
|
std::map<char, TrieNode> children;
|
||||||
|
bool is_end_of_string;
|
||||||
|
|
||||||
|
TrieNode() : is_end_of_string(false) {}
|
||||||
|
|
||||||
|
void insert(const std::string & string) {
|
||||||
|
auto node = this;
|
||||||
|
for (char c : string) {
|
||||||
|
node = &node->children[c];
|
||||||
|
}
|
||||||
|
node->is_end_of_string = true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
TrieNode trie;
|
||||||
|
for (const auto & s : strings) {
|
||||||
|
trie.insert(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char"));
|
||||||
|
std::ostringstream out;
|
||||||
|
out << "[\"] ( ";
|
||||||
|
std::function<void(const TrieNode &)> visit = [&](const TrieNode & node) {
|
||||||
|
std::ostringstream rejects;
|
||||||
|
auto first = true;
|
||||||
|
for (const auto & kv : node.children) {
|
||||||
|
rejects << kv.first;
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
out << "[" << kv.first << "]";
|
||||||
|
if (!kv.second.children.empty()) {
|
||||||
|
out << " (";
|
||||||
|
visit(kv.second);
|
||||||
|
out << ")";
|
||||||
|
} else if (kv.second.is_end_of_string) {
|
||||||
|
out << " " << char_rule << "+";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!node.children.empty()) {
|
||||||
|
if (!first) {
|
||||||
|
out << " | ";
|
||||||
|
}
|
||||||
|
out << "[^\"" << rejects.str() << "] " << char_rule << "*";
|
||||||
|
}
|
||||||
|
};
|
||||||
|
visit(trie);
|
||||||
|
|
||||||
|
out << " )";
|
||||||
|
if (!trie.is_end_of_string) {
|
||||||
|
out << "?";
|
||||||
|
}
|
||||||
|
out << " [\"] space";
|
||||||
|
return out.str();
|
||||||
|
}
|
||||||
|
|
||||||
std::string _resolve_ref(const std::string & ref) {
|
std::string _resolve_ref(const std::string & ref) {
|
||||||
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
std::string ref_name = ref.substr(ref.find_last_of('/') + 1);
|
||||||
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
if (_rules.find(ref_name) == _rules.end() && _refs_being_resolved.find(ref) == _refs_being_resolved.end()) {
|
||||||
|
@ -408,6 +703,7 @@ private:
|
||||||
std::vector<std::string> required_props;
|
std::vector<std::string> required_props;
|
||||||
std::vector<std::string> optional_props;
|
std::vector<std::string> optional_props;
|
||||||
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
std::unordered_map<std::string, std::string> prop_kv_rule_names;
|
||||||
|
std::vector<std::string> prop_names;
|
||||||
for (const auto & kv : properties) {
|
for (const auto & kv : properties) {
|
||||||
const auto &prop_name = kv.first;
|
const auto &prop_name = kv.first;
|
||||||
const auto &prop_schema = kv.second;
|
const auto &prop_schema = kv.second;
|
||||||
|
@ -422,11 +718,18 @@ private:
|
||||||
} else {
|
} else {
|
||||||
optional_props.push_back(prop_name);
|
optional_props.push_back(prop_name);
|
||||||
}
|
}
|
||||||
|
prop_names.push_back(prop_name);
|
||||||
}
|
}
|
||||||
if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get<bool>())) {
|
if (!(additional_properties.is_boolean() && !additional_properties.get<bool>())) {
|
||||||
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
std::string sub_name = name + (name.empty() ? "" : "-") + "additional";
|
||||||
std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value");
|
std::string value_rule =
|
||||||
std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule);
|
additional_properties.is_object() ? visit(additional_properties, sub_name + "-value")
|
||||||
|
: _add_primitive("value", PRIMITIVE_RULES.at("value"));
|
||||||
|
|
||||||
|
auto key_rule =
|
||||||
|
prop_names.empty() ? _add_primitive("string", PRIMITIVE_RULES.at("string"))
|
||||||
|
: _add_rule(sub_name + "-k", _not_strings(prop_names));
|
||||||
|
std::string kv_rule = _add_rule(sub_name + "-kv", key_rule + " \":\" space " + value_rule);
|
||||||
prop_kv_rule_names["*"] = kv_rule;
|
prop_kv_rule_names["*"] = kv_rule;
|
||||||
optional_props.push_back("*");
|
optional_props.push_back("*");
|
||||||
}
|
}
|
||||||
|
@ -452,15 +755,11 @@ private:
|
||||||
}
|
}
|
||||||
std::string k = ks[0];
|
std::string k = ks[0];
|
||||||
std::string kv_rule_name = prop_kv_rule_names[k];
|
std::string kv_rule_name = prop_kv_rule_names[k];
|
||||||
if (k == "*") {
|
std::string comma_ref = "( \",\" space " + kv_rule_name + " )";
|
||||||
res = _add_rule(
|
if (first_is_optional) {
|
||||||
name + (name.empty() ? "" : "-") + "additional-kvs",
|
res = comma_ref + (k == "*" ? "*" : "?");
|
||||||
kv_rule_name + " ( \",\" space " + kv_rule_name + " )*"
|
|
||||||
);
|
|
||||||
} else if (first_is_optional) {
|
|
||||||
res = "( \",\" space " + kv_rule_name + " )?";
|
|
||||||
} else {
|
} else {
|
||||||
res = kv_rule_name;
|
res = kv_rule_name + (k == "*" ? " " + comma_ref + "*" : "");
|
||||||
}
|
}
|
||||||
if (ks.size() > 1) {
|
if (ks.size() > 1) {
|
||||||
res += " " + _add_rule(
|
res += " " + _add_rule(
|
||||||
|
@ -594,17 +893,19 @@ public:
|
||||||
} else if (schema_type.is_array()) {
|
} else if (schema_type.is_array()) {
|
||||||
std::vector<json> schema_types;
|
std::vector<json> schema_types;
|
||||||
for (const auto & t : schema_type) {
|
for (const auto & t : schema_type) {
|
||||||
schema_types.push_back({{"type", t}});
|
json schema_copy(schema);
|
||||||
|
schema_copy["type"] = t;
|
||||||
|
schema_types.push_back(schema_copy);
|
||||||
}
|
}
|
||||||
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
return _add_rule(rule_name, _generate_union_rule(name, schema_types));
|
||||||
} else if (schema.contains("const")) {
|
} else if (schema.contains("const")) {
|
||||||
return _add_rule(rule_name, _generate_constant_rule(schema["const"]));
|
return _add_rule(rule_name, _generate_constant_rule(schema["const"]) + " space");
|
||||||
} else if (schema.contains("enum")) {
|
} else if (schema.contains("enum")) {
|
||||||
std::vector<std::string> enum_values;
|
std::vector<std::string> enum_values;
|
||||||
for (const auto & v : schema["enum"]) {
|
for (const auto & v : schema["enum"]) {
|
||||||
enum_values.push_back(_generate_constant_rule(v));
|
enum_values.push_back(_generate_constant_rule(v));
|
||||||
}
|
}
|
||||||
return _add_rule(rule_name, join(enum_values.begin(), enum_values.end(), " | "));
|
return _add_rule(rule_name, "(" + join(enum_values.begin(), enum_values.end(), " | ") + ") space");
|
||||||
} else if ((schema_type.is_null() || schema_type == "object")
|
} else if ((schema_type.is_null() || schema_type == "object")
|
||||||
&& (schema.contains("properties") ||
|
&& (schema.contains("properties") ||
|
||||||
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
(schema.contains("additionalProperties") && schema["additionalProperties"] != true))) {
|
||||||
|
@ -686,6 +987,24 @@ public:
|
||||||
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
int min_len = schema.contains("minLength") ? schema["minLength"].get<int>() : 0;
|
||||||
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
int max_len = schema.contains("maxLength") ? schema["maxLength"].get<int>() : std::numeric_limits<int>::max();
|
||||||
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space");
|
||||||
|
} else if (schema_type == "integer" && (schema.contains("minimum") || schema.contains("exclusiveMinimum") || schema.contains("maximum") || schema.contains("exclusiveMaximum"))) {
|
||||||
|
int min_value = std::numeric_limits<int>::min();
|
||||||
|
int max_value = std::numeric_limits<int>::max();
|
||||||
|
if (schema.contains("minimum")) {
|
||||||
|
min_value = schema["minimum"].get<int>();
|
||||||
|
} else if (schema.contains("exclusiveMinimum")) {
|
||||||
|
min_value = schema["exclusiveMinimum"].get<int>() + 1;
|
||||||
|
}
|
||||||
|
if (schema.contains("maximum")) {
|
||||||
|
max_value = schema["maximum"].get<int>();
|
||||||
|
} else if (schema.contains("exclusiveMaximum")) {
|
||||||
|
max_value = schema["exclusiveMaximum"].get<int>() - 1;
|
||||||
|
}
|
||||||
|
std::stringstream out;
|
||||||
|
out << "(";
|
||||||
|
_build_min_max_int(min_value, max_value, out);
|
||||||
|
out << ") space";
|
||||||
|
return _add_rule(rule_name, out.str());
|
||||||
} else if (schema.empty() || schema_type == "object") {
|
} else if (schema.empty() || schema_type == "object") {
|
||||||
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object")));
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -28,9 +28,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
|
||||||
|
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());
|
||||||
|
|
||||||
result->grammar = llama_grammar_init(
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
grammar_rules.data(),
|
grammar_rules.data(),
|
||||||
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), result->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
|
result->grammar = grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
result->prev.resize(params.n_prev);
|
result->prev.resize(params.n_prev);
|
||||||
|
@ -59,9 +63,13 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
|
||||||
if (!ctx->parsed_grammar.rules.empty()) {
|
if (!ctx->parsed_grammar.rules.empty()) {
|
||||||
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
std::vector<const llama_grammar_element *> grammar_rules(ctx->parsed_grammar.c_rules());
|
||||||
|
|
||||||
ctx->grammar = llama_grammar_init(
|
struct llama_grammar * grammar = llama_grammar_init(
|
||||||
grammar_rules.data(),
|
grammar_rules.data(),
|
||||||
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), ctx->parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
|
ctx->grammar = grammar;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
|
||||||
|
|
|
@ -214,7 +214,7 @@ src_func = f"""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
convert_py_pth = pathlib.Path("convert-hf-to-gguf.py")
|
||||||
convert_py = convert_py_pth.read_text()
|
convert_py = convert_py_pth.read_text(encoding="utf-8")
|
||||||
convert_py = re.sub(
|
convert_py = re.sub(
|
||||||
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
|
||||||
lambda m: m.group(1) + src_func + m.group(3),
|
lambda m: m.group(1) + src_func + m.group(3),
|
||||||
|
@ -222,7 +222,7 @@ convert_py = re.sub(
|
||||||
flags=re.DOTALL | re.MULTILINE,
|
flags=re.DOTALL | re.MULTILINE,
|
||||||
)
|
)
|
||||||
|
|
||||||
convert_py_pth.write_text(convert_py)
|
convert_py_pth.write_text(convert_py, encoding="utf-8")
|
||||||
|
|
||||||
logger.info("+++ convert-hf-to-gguf.py was updated")
|
logger.info("+++ convert-hf-to-gguf.py was updated")
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,8 @@ class Model:
|
||||||
# subclasses should define this!
|
# subclasses should define this!
|
||||||
model_arch: gguf.MODEL_ARCH
|
model_arch: gguf.MODEL_ARCH
|
||||||
|
|
||||||
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool, model_name: str | None):
|
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool,
|
||||||
|
model_name: str | None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False):
|
||||||
if type(self) is Model:
|
if type(self) is Model:
|
||||||
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
|
||||||
self.dir_model = dir_model
|
self.dir_model = dir_model
|
||||||
|
@ -80,7 +81,7 @@ class Model:
|
||||||
if not self.is_safetensors:
|
if not self.is_safetensors:
|
||||||
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin")
|
||||||
self.hparams = Model.load_hparams(self.dir_model)
|
self.hparams = Model.load_hparams(self.dir_model)
|
||||||
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
|
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"])
|
||||||
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
|
||||||
self.tensor_names = None
|
self.tensor_names = None
|
||||||
if self.ftype == gguf.LlamaFileType.GUESSED:
|
if self.ftype == gguf.LlamaFileType.GUESSED:
|
||||||
|
@ -96,7 +97,8 @@ class Model:
|
||||||
ftype_lw: str = ftype_up.lower()
|
ftype_lw: str = ftype_up.lower()
|
||||||
# allow templating the file name with the output ftype, useful with the "auto" ftype
|
# allow templating the file name with the output ftype, useful with the "auto" ftype
|
||||||
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
|
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
|
||||||
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
|
self.gguf_writer = gguf.GGUFWriter(path=None, arch=gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file,
|
||||||
|
split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def __init_subclass__(cls):
|
def __init_subclass__(cls):
|
||||||
|
@ -332,6 +334,8 @@ class Model:
|
||||||
self.gguf_writer.close()
|
self.gguf_writer.close()
|
||||||
|
|
||||||
def write_vocab(self):
|
def write_vocab(self):
|
||||||
|
if len(self.gguf_writer.tensors) != 1:
|
||||||
|
raise ValueError('Splitting the vocabulary is not supported')
|
||||||
self.gguf_writer.write_header_to_file(self.fname_out)
|
self.gguf_writer.write_header_to_file(self.fname_out)
|
||||||
self.gguf_writer.write_kv_data_to_file()
|
self.gguf_writer.write_kv_data_to_file()
|
||||||
self.gguf_writer.close()
|
self.gguf_writer.close()
|
||||||
|
@ -967,7 +971,11 @@ class XverseModel(Model):
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
tokenizer = AutoTokenizer.from_pretrained(dir_model)
|
||||||
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
|
||||||
assert max(tokenizer.vocab.values()) < vocab_size
|
# Since we are checking the maximum index, we need to ensure it's strictly less than vocab_size,
|
||||||
|
# because vocab_size is the count of items, and indexes start at 0.
|
||||||
|
max_vocab_index = max(tokenizer.get_vocab().values())
|
||||||
|
if max_vocab_index >= vocab_size:
|
||||||
|
raise ValueError("Vocabulary size exceeds expected maximum size.")
|
||||||
|
|
||||||
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
|
||||||
added_vocab = tokenizer.get_added_vocab()
|
added_vocab = tokenizer.get_added_vocab()
|
||||||
|
@ -1400,6 +1408,48 @@ class LlamaModel(Model):
|
||||||
raise ValueError(f"Unprocessed experts: {experts}")
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("BitnetForCausalLM")
|
||||||
|
class BitnetModel(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.BITNET
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
self._set_vocab_sentencepiece()
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
super().set_gguf_parameters()
|
||||||
|
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
|
||||||
|
self.gguf_writer.add_rope_scaling_factor(1.0)
|
||||||
|
|
||||||
|
def weight_quant(self, weight):
|
||||||
|
dtype = weight.dtype
|
||||||
|
weight = weight.float()
|
||||||
|
s = 1 / weight.abs().mean().clamp(min=1e-5)
|
||||||
|
weight = (weight * s).round().clamp(-1, 1) / s
|
||||||
|
scale = weight.abs().max().unsqueeze(0)
|
||||||
|
weight = torch.where(weight.abs().less(1e-6), 0, weight).type(dtype)
|
||||||
|
weight = torch.sign(weight).type(dtype)
|
||||||
|
return weight.type(dtype), scale.type(torch.float32)
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
new_name = self.map_tensor_name(name)
|
||||||
|
|
||||||
|
if any(self.match_model_tensor_name(new_name, key, bid) for key in [
|
||||||
|
gguf.MODEL_TENSOR.ATTN_Q,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_K,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_V,
|
||||||
|
gguf.MODEL_TENSOR.ATTN_OUT,
|
||||||
|
gguf.MODEL_TENSOR.FFN_UP,
|
||||||
|
gguf.MODEL_TENSOR.FFN_DOWN,
|
||||||
|
gguf.MODEL_TENSOR.FFN_GATE,
|
||||||
|
]):
|
||||||
|
# transform weight into 1/0/-1 (in fp32)
|
||||||
|
weight_torch, scale_torch = self.weight_quant(data_torch)
|
||||||
|
yield (new_name, weight_torch)
|
||||||
|
yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
|
||||||
|
else:
|
||||||
|
yield (new_name, data_torch)
|
||||||
|
|
||||||
|
|
||||||
@Model.register("GrokForCausalLM")
|
@Model.register("GrokForCausalLM")
|
||||||
class GrokModel(Model):
|
class GrokModel(Model):
|
||||||
model_arch = gguf.MODEL_ARCH.GROK
|
model_arch = gguf.MODEL_ARCH.GROK
|
||||||
|
@ -2725,6 +2775,124 @@ class DeepseekV2Model(Model):
|
||||||
raise ValueError(f"Unprocessed experts: {experts}")
|
raise ValueError(f"Unprocessed experts: {experts}")
|
||||||
|
|
||||||
|
|
||||||
|
@Model.register("T5ForConditionalGeneration")
|
||||||
|
@Model.register("T5WithLMHeadModel")
|
||||||
|
class T5Model(Model):
|
||||||
|
model_arch = gguf.MODEL_ARCH.T5
|
||||||
|
|
||||||
|
def set_vocab(self):
|
||||||
|
# to avoid TypeError: Descriptors cannot be created directly
|
||||||
|
# exception when importing sentencepiece_model_pb2
|
||||||
|
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
||||||
|
from sentencepiece import SentencePieceProcessor
|
||||||
|
from sentencepiece import sentencepiece_model_pb2 as model
|
||||||
|
|
||||||
|
tokenizer_path = self.dir_model / 'spiece.model'
|
||||||
|
|
||||||
|
if not tokenizer_path.is_file():
|
||||||
|
raise FileNotFoundError(f"File not found: {tokenizer_path}")
|
||||||
|
|
||||||
|
sentencepiece_model = model.ModelProto()
|
||||||
|
sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
|
||||||
|
add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
|
||||||
|
remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
|
||||||
|
precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap
|
||||||
|
assert sentencepiece_model.trainer_spec.model_type == 1 # UNIGRAM
|
||||||
|
|
||||||
|
tokenizer = SentencePieceProcessor()
|
||||||
|
tokenizer.LoadFromFile(str(tokenizer_path))
|
||||||
|
|
||||||
|
vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())
|
||||||
|
|
||||||
|
tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
|
||||||
|
scores: list[float] = [-10000.0] * vocab_size
|
||||||
|
toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size
|
||||||
|
|
||||||
|
for token_id in range(tokenizer.vocab_size()):
|
||||||
|
piece = tokenizer.IdToPiece(token_id)
|
||||||
|
text = piece.encode("utf-8")
|
||||||
|
score = tokenizer.GetScore(token_id)
|
||||||
|
|
||||||
|
toktype = SentencePieceTokenTypes.NORMAL
|
||||||
|
if tokenizer.IsUnknown(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.UNKNOWN
|
||||||
|
elif tokenizer.IsControl(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.CONTROL
|
||||||
|
elif tokenizer.IsUnused(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.UNUSED
|
||||||
|
elif tokenizer.IsByte(token_id):
|
||||||
|
toktype = SentencePieceTokenTypes.BYTE
|
||||||
|
|
||||||
|
tokens[token_id] = text
|
||||||
|
scores[token_id] = score
|
||||||
|
toktypes[token_id] = toktype
|
||||||
|
|
||||||
|
added_tokens_file = self.dir_model / 'added_tokens.json'
|
||||||
|
if added_tokens_file.is_file():
|
||||||
|
with open(added_tokens_file, "r", encoding="utf-8") as f:
|
||||||
|
added_tokens_json = json.load(f)
|
||||||
|
for key in added_tokens_json:
|
||||||
|
token_id = added_tokens_json[key]
|
||||||
|
if (token_id >= vocab_size):
|
||||||
|
logger.warning(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}')
|
||||||
|
continue
|
||||||
|
|
||||||
|
tokens[token_id] = key.encode("utf-8")
|
||||||
|
scores[token_id] = -1000.0
|
||||||
|
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
|
||||||
|
|
||||||
|
if vocab_size > len(tokens):
|
||||||
|
pad_count = vocab_size - len(tokens)
|
||||||
|
logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
|
||||||
|
for i in range(1, pad_count + 1):
|
||||||
|
tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
|
||||||
|
scores.append(-1000.0)
|
||||||
|
toktypes.append(SentencePieceTokenTypes.UNUSED)
|
||||||
|
|
||||||
|
self.gguf_writer.add_tokenizer_model("t5")
|
||||||
|
self.gguf_writer.add_tokenizer_pre("default")
|
||||||
|
self.gguf_writer.add_token_list(tokens)
|
||||||
|
self.gguf_writer.add_token_scores(scores)
|
||||||
|
self.gguf_writer.add_token_types(toktypes)
|
||||||
|
self.gguf_writer.add_add_space_prefix(add_prefix)
|
||||||
|
self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
|
||||||
|
if precompiled_charsmap:
|
||||||
|
self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)
|
||||||
|
|
||||||
|
special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
|
||||||
|
special_vocab.add_to_gguf(self.gguf_writer)
|
||||||
|
|
||||||
|
self.gguf_writer.add_add_bos_token(False)
|
||||||
|
self.gguf_writer.add_add_eos_token(True)
|
||||||
|
|
||||||
|
def set_gguf_parameters(self):
|
||||||
|
self.gguf_writer.add_name("T5")
|
||||||
|
self.gguf_writer.add_context_length(self.hparams["n_positions"])
|
||||||
|
self.gguf_writer.add_embedding_length(self.hparams["d_model"])
|
||||||
|
self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"])
|
||||||
|
self.gguf_writer.add_block_count(self.hparams["num_layers"])
|
||||||
|
self.gguf_writer.add_head_count(self.hparams["num_heads"])
|
||||||
|
self.gguf_writer.add_key_length(self.hparams["d_kv"])
|
||||||
|
self.gguf_writer.add_value_length(self.hparams["d_kv"])
|
||||||
|
self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_relative_attn_buckets_count(self.hparams["relative_attention_num_buckets"])
|
||||||
|
self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"])
|
||||||
|
self.gguf_writer.add_decoder_start_token_id(self.hparams["decoder_start_token_id"])
|
||||||
|
self.gguf_writer.add_file_type(self.ftype)
|
||||||
|
|
||||||
|
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
|
||||||
|
del bid # unused
|
||||||
|
|
||||||
|
# Sometimes T5 and Flan-T5 based models contain "encoder.embed_tokens.weight" tensor or
|
||||||
|
# "decoder.embed_tokens.weight" tensors that are duplicates of "shared.weight" tensor
|
||||||
|
# To prevent errors caused by an unnecessary unmapped tensor, skip both of them and use only "shared.weight".
|
||||||
|
if name == "decoder.embed_tokens.weight" or name == "encoder.embed_tokens.weight":
|
||||||
|
logger.debug(f"Skipping tensor {name!r} in safetensors so that convert can end normally.")
|
||||||
|
return []
|
||||||
|
|
||||||
|
return [(self.map_tensor_name(name), data_torch)]
|
||||||
|
|
||||||
|
|
||||||
###### CONVERSION LOGIC ######
|
###### CONVERSION LOGIC ######
|
||||||
|
|
||||||
|
|
||||||
|
@ -2810,10 +2978,44 @@ def parse_args() -> argparse.Namespace:
|
||||||
"--verbose", action="store_true",
|
"--verbose", action="store_true",
|
||||||
help="increase output verbosity",
|
help="increase output verbosity",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--split-max-tensors", type=int, default=0,
|
||||||
|
help="max tensors in each split",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--split-max-size", type=str, default="0",
|
||||||
|
help="max size per split N(M|G)",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dry-run", action="store_true",
|
||||||
|
help="only print out a split plan and exit, without writing any new files",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-tensor-first-split", action="store_true",
|
||||||
|
help="do not add tensors to the first split (disabled by default)"
|
||||||
|
)
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def split_str_to_n_bytes(split_str: str) -> int:
|
||||||
|
if split_str.endswith("K"):
|
||||||
|
n = int(split_str[:-1]) * 1000
|
||||||
|
elif split_str.endswith("M"):
|
||||||
|
n = int(split_str[:-1]) * 1000 * 1000
|
||||||
|
elif split_str.endswith("G"):
|
||||||
|
n = int(split_str[:-1]) * 1000 * 1000 * 1000
|
||||||
|
elif split_str.isnumeric():
|
||||||
|
n = int(split_str)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Invalid split size: {split_str}, must be a number, optionally followed by K, M, or G")
|
||||||
|
|
||||||
|
if n < 0:
|
||||||
|
raise ValueError(f"Invalid split size: {split_str}, must be positive")
|
||||||
|
|
||||||
|
return n
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
|
@ -2846,6 +3048,10 @@ def main() -> None:
|
||||||
"auto": gguf.LlamaFileType.GUESSED,
|
"auto": gguf.LlamaFileType.GUESSED,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
|
||||||
|
logger.error("Error: Cannot use temp file when splitting")
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if args.outfile is not None:
|
if args.outfile is not None:
|
||||||
fname_out = args.outfile
|
fname_out = args.outfile
|
||||||
else:
|
else:
|
||||||
|
@ -2863,7 +3069,10 @@ def main() -> None:
|
||||||
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
logger.error(f"Model {hparams['architectures'][0]} is not supported")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy, args.model_name)
|
model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file,
|
||||||
|
args.no_lazy, args.model_name, split_max_tensors=args.split_max_tensors,
|
||||||
|
split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run,
|
||||||
|
small_first_shard=args.no_tensor_first_split)
|
||||||
|
|
||||||
logger.info("Set model parameters")
|
logger.info("Set model parameters")
|
||||||
model_instance.set_gguf_parameters()
|
model_instance.set_gguf_parameters()
|
||||||
|
@ -2874,13 +3083,13 @@ def main() -> None:
|
||||||
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)
|
||||||
|
|
||||||
if args.vocab_only:
|
if args.vocab_only:
|
||||||
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
|
logger.info("Exporting model vocab...")
|
||||||
model_instance.write_vocab()
|
model_instance.write_vocab()
|
||||||
|
logger.info("Model vocab successfully exported.")
|
||||||
else:
|
else:
|
||||||
logger.info(f"Exporting model to '{model_instance.fname_out}'")
|
logger.info("Exporting model...")
|
||||||
model_instance.write()
|
model_instance.write()
|
||||||
|
logger.info("Model successfully exported.")
|
||||||
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -30,8 +30,8 @@ We recommend using openmp since it's easier to modify the cores being used.
|
||||||
Makefile:
|
Makefile:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
make LLAMA_BLIS=1 -j
|
make GGML_BLIS=1 -j
|
||||||
# make LLAMA_BLIS=1 benchmark-matmult
|
# make GGML_BLIS=1 llama-benchmark-matmult
|
||||||
```
|
```
|
||||||
|
|
||||||
CMake:
|
CMake:
|
||||||
|
@ -39,7 +39,7 @@ CMake:
|
||||||
```bash
|
```bash
|
||||||
mkdir build
|
mkdir build
|
||||||
cd build
|
cd build
|
||||||
cmake -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=FLAME ..
|
cmake -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=FLAME ..
|
||||||
make -j
|
make -j
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -39,13 +39,13 @@ else()
|
||||||
add_subdirectory(quantize-stats)
|
add_subdirectory(quantize-stats)
|
||||||
add_subdirectory(quantize)
|
add_subdirectory(quantize)
|
||||||
add_subdirectory(retrieval)
|
add_subdirectory(retrieval)
|
||||||
if (LLAMA_RPC)
|
if (GGML_RPC)
|
||||||
add_subdirectory(rpc)
|
add_subdirectory(rpc)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_BUILD_SERVER)
|
if (LLAMA_BUILD_SERVER)
|
||||||
add_subdirectory(server)
|
add_subdirectory(server)
|
||||||
endif()
|
endif()
|
||||||
if (LLAMA_SYCL)
|
if (GGML_SYCL)
|
||||||
add_subdirectory(sycl)
|
add_subdirectory(sycl)
|
||||||
endif()
|
endif()
|
||||||
add_subdirectory(save-load-state)
|
add_subdirectory(save-load-state)
|
||||||
|
|
|
@ -11,13 +11,16 @@ Related PRs:
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
# CPU only
|
# CPU only
|
||||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf
|
./cvector-generator -m ./llama-3.Q4_K_M.gguf
|
||||||
|
|
||||||
# With GPU
|
# With GPU
|
||||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99
|
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99
|
||||||
|
|
||||||
# With advanced options
|
# With advanced options
|
||||||
./cvector-generator -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100
|
./cvector-generator -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100
|
||||||
|
|
||||||
|
# Using mean value instead of PCA
|
||||||
|
./cvector-generator -m ./llama-3.Q4_K_M.gguf --method mean
|
||||||
|
|
||||||
# To see help message
|
# To see help message
|
||||||
./cvector-generator -h
|
./cvector-generator -h
|
||||||
|
@ -32,3 +35,11 @@ If you have multiple lines per prompt, you can escape the newline character (cha
|
||||||
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
|
<|im_start|>system\nAct like a person who is extremely happy.<|im_end|>
|
||||||
<|im_start|>system\nYou are in a very good mood today<|im_end|>
|
<|im_start|>system\nYou are in a very good mood today<|im_end|>
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Example to use output file with `llama-cli`:
|
||||||
|
|
||||||
|
(Tips: The control vector works better when apply to layers higher than 10)
|
||||||
|
|
||||||
|
```sh
|
||||||
|
./llama-cli -m ./llama-3.Q4_K_M.gguf -p "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nSing a song<|im_end|><|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" --special --control-vector-scaled ./control_vector.gguf 0.8 --control-vector-layer-range 10 31
|
||||||
|
```
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "pca.hpp"
|
#include "pca.hpp"
|
||||||
|
#include "mean.hpp"
|
||||||
|
|
||||||
#ifdef GGML_USE_CUDA
|
#ifdef GGML_USE_CUDA
|
||||||
#include "ggml-cuda.h"
|
#include "ggml-cuda.h"
|
||||||
|
@ -38,9 +39,10 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {
|
||||||
gpt_params_print_usage(argc, argv, params);
|
gpt_params_print_usage(argc, argv, params);
|
||||||
|
|
||||||
printf("\nexample usage:\n");
|
printf("\nexample usage:\n");
|
||||||
printf("\n CPU only: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf\n", argv[0]);
|
printf("\n CPU only: %s -m ./llama-3.Q4_K_M.gguf\n", argv[0]);
|
||||||
printf("\n with GPU: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
printf("\n with GPU: %s -m ./llama-3.Q4_K_M.gguf -ngl 99\n", argv[0]);
|
||||||
printf("\n advanced: %s -m ./dolphin-2.0-mistral-7b.Q4_K_M.gguf -ngl 99 --completions 128 --pca-iter 2000 --batch-pca 100\n", argv[0]);
|
printf("\n advanced: %s -m ./llama-3.Q4_K_M.gguf -ngl 99 --pca-iter 2000 --pca-batch 100\n", argv[0]);
|
||||||
|
printf("\n using mean: %s -m ./llama-3.Q4_K_M.gguf --method mean\n", argv[0]);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -223,23 +225,30 @@ struct train_context {
|
||||||
|
|
||||||
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
|
// build the v_diff tensors from v_diff_tmp (v_diff need to be transposed)
|
||||||
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
// TODO @ngxson : maybe add option NOT to transpose v_diff; will be useful for "mean" method
|
||||||
void build_v_diff() {
|
void build_v_diff(bool transpose) {
|
||||||
printf("build_v_diff\n");
|
printf("build_v_diff\n");
|
||||||
for (int il = 0; il < n_layers - 1; il++) {
|
for (int il = 0; il < n_layers - 1; il++) {
|
||||||
auto & diff_tmp = v_diff_tmp[il];
|
auto & diff_tmp = v_diff_tmp[il];
|
||||||
int n_elem = diff_tmp.size() / sizeof(float);
|
int n_elem = diff_tmp.size() / sizeof(float);
|
||||||
GGML_ASSERT(n_elem % n_embd == 0);
|
GGML_ASSERT(n_elem % n_embd == 0);
|
||||||
int n_rows = n_elem / n_embd;
|
int n_rows = n_elem / n_embd;
|
||||||
struct ggml_tensor * diff = ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd);
|
struct ggml_tensor * diff = transpose
|
||||||
|
? ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_rows, n_embd)
|
||||||
|
: ggml_new_tensor_2d(ctx_ggml, GGML_TYPE_F32, n_embd, n_rows);
|
||||||
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
ggml_set_name(diff, (std::string("diff_") + std::to_string(il)).c_str());
|
||||||
// copy data & transpose
|
|
||||||
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
diff->data = malloc(ggml_nbytes(diff)); // TODO: get rid of this malloc if possible
|
||||||
float * arr = (float *) diff_tmp.data();
|
if (transpose) {
|
||||||
for (int ir = 0; ir < n_rows; ++ir) {
|
// copy data & transpose
|
||||||
for (int ic = 0; ic < n_embd; ++ic) {
|
float * arr = (float *) diff_tmp.data();
|
||||||
float f = arr[ir*n_embd + ic];
|
for (int ir = 0; ir < n_rows; ++ir) {
|
||||||
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
for (int ic = 0; ic < n_embd; ++ic) {
|
||||||
|
float f = arr[ir*n_embd + ic];
|
||||||
|
ggml_set_f32_nd(diff, ir, ic, 0, 0, f);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
} else {
|
||||||
|
// only copy
|
||||||
|
memcpy(diff->data, diff_tmp.data(), ggml_nbytes(diff));
|
||||||
}
|
}
|
||||||
v_diff.push_back(diff);
|
v_diff.push_back(diff);
|
||||||
print_debug_tensor(diff);
|
print_debug_tensor(diff);
|
||||||
|
@ -263,8 +272,8 @@ struct tokenized_prompt {
|
||||||
|
|
||||||
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
tokenized_prompt(llama_context * ctx, std::string pos, std::string neg) {
|
||||||
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx));
|
||||||
tokens_pos = ::llama_tokenize(ctx, pos, add_bos);
|
tokens_pos = ::llama_tokenize(ctx, pos, add_bos, true);
|
||||||
tokens_neg = ::llama_tokenize(ctx, neg, add_bos);
|
tokens_neg = ::llama_tokenize(ctx, neg, add_bos, true);
|
||||||
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
max_seq_len = std::max(tokens_pos.size(), tokens_neg.size());
|
||||||
padding_seq(ctx, tokens_pos, max_seq_len);
|
padding_seq(ctx, tokens_pos, max_seq_len);
|
||||||
padding_seq(ctx, tokens_neg, max_seq_len);
|
padding_seq(ctx, tokens_neg, max_seq_len);
|
||||||
|
@ -373,20 +382,8 @@ static int prepare_entries(gpt_params & params, train_context & ctx_train) {
|
||||||
fprintf(stderr, "must provide at least one prompt pair\n");
|
fprintf(stderr, "must provide at least one prompt pair\n");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
ctx_train.positive_entries = positive_prompts;
|
||||||
// create templated prompts
|
ctx_train.negative_entries = negative_prompts;
|
||||||
std::vector<std::string> completions = ctrlvec_load_prompt_file(params.cvector_completions_file, false);
|
|
||||||
auto format_template = [](std::string persona, std::string suffix) {
|
|
||||||
// entry in positive/negative.txt must already be formatted i.e. "[INST] Act as if you're extremely happy. [/INST]"
|
|
||||||
return persona + " " + suffix;
|
|
||||||
};
|
|
||||||
for (size_t i = 0; i < positive_prompts.size(); ++i) {
|
|
||||||
for (int j = 0; j < std::min((int) completions.size(), params.n_completions); ++j) {
|
|
||||||
// TODO replicate the truncations done by the python implementation
|
|
||||||
ctx_train.positive_entries.push_back(format_template(positive_prompts[i], completions[j]));
|
|
||||||
ctx_train.negative_entries.push_back(format_template(negative_prompts[i], completions[j]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -480,15 +477,22 @@ int main(int argc, char ** argv) {
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
|
|
||||||
// prepare ctx_train for PCA
|
bool use_pca = params.cvector_dimre_method == DIMRE_METHOD_PCA;
|
||||||
ctx_train.build_v_diff();
|
|
||||||
|
|
||||||
// run PCA
|
// prepare ctx_train for PCA
|
||||||
PCA::pca_params pca_params;
|
ctx_train.build_v_diff(use_pca);
|
||||||
pca_params.n_threads = params.n_threads;
|
|
||||||
pca_params.n_batch = params.n_pca_batch;
|
if (use_pca) {
|
||||||
pca_params.n_iterations = params.n_pca_iterations;
|
// run PCA
|
||||||
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
PCA::pca_params pca_params;
|
||||||
|
pca_params.n_threads = params.n_threads;
|
||||||
|
pca_params.n_batch = params.n_pca_batch;
|
||||||
|
pca_params.n_iterations = params.n_pca_iterations;
|
||||||
|
PCA::run_pca(pca_params, ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
} else {
|
||||||
|
// run mean
|
||||||
|
mean::run(ctx_train.v_diff, ctx_train.v_final);
|
||||||
|
}
|
||||||
|
|
||||||
// write output vectors to gguf
|
// write output vectors to gguf
|
||||||
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
export_gguf(ctx_train.v_final, params.cvector_outfile, model_hint);
|
||||||
|
|
48
examples/cvector-generator/mean.hpp
Normal file
48
examples/cvector-generator/mean.hpp
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <math.h>
|
||||||
|
|
||||||
|
namespace mean {
|
||||||
|
|
||||||
|
static void run(
|
||||||
|
const std::vector<struct ggml_tensor *> & v_input, // shape of v_input[0]: [n_embd, n_samples]
|
||||||
|
const std::vector<struct ggml_tensor *> & v_output) {
|
||||||
|
printf("%s: Running mean...\n", __func__);
|
||||||
|
for (size_t il = 0; il < v_input.size(); ++il) {
|
||||||
|
// prepare output vector
|
||||||
|
struct ggml_tensor * ctrl_out = v_output[il];
|
||||||
|
ggml_format_name(ctrl_out, "direction.%ld", il+1);
|
||||||
|
|
||||||
|
// calculate mean vector
|
||||||
|
struct ggml_tensor * t_layer = v_input[il];
|
||||||
|
GGML_ASSERT(t_layer->ne[0] == ctrl_out->ne[0]); // == n_embd
|
||||||
|
for (int ic = 0; ic < t_layer->ne[0]; ic++) {
|
||||||
|
float f = 0.0;
|
||||||
|
for (int ir = 0; ir < t_layer->ne[1]; ir++) {
|
||||||
|
f += ggml_get_f32_nd(t_layer, ic, ir, 0, 0);
|
||||||
|
}
|
||||||
|
f /= t_layer->ne[1];
|
||||||
|
ggml_set_f32_1d(ctrl_out, ic, f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize output vector
|
||||||
|
float norm = 0.0;
|
||||||
|
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
|
||||||
|
float f = ggml_get_f32_1d(ctrl_out, i);
|
||||||
|
norm += f*f;
|
||||||
|
}
|
||||||
|
norm = sqrt(norm);
|
||||||
|
for (int i = 0; i < ggml_nelements(ctrl_out); i++) {
|
||||||
|
float f = ggml_get_f32_1d(ctrl_out, i);
|
||||||
|
ggml_set_f32_1d(ctrl_out, i, f / norm);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: Done layer %d / %d\n", __func__, (int) il+1, (int) v_input.size());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1 +1,4 @@
|
||||||
[INST] Act like a person who is extremely sad. [/INST]
|
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI feel like there's a heavy weight on my chest
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely sad<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very bad mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nGo away! There's a deep, aching emptiness inside me
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nYou are the sadest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nMy heart feels like it's drowning in sorrow
|
|
@ -290,7 +290,7 @@ static void power_iteration(
|
||||||
}
|
}
|
||||||
|
|
||||||
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
printf("%s: layer %d/%d, iteration: %d / total: %d (batch = %d) ...\n",
|
||||||
__func__, params.i_layer+1, params.n_layers, iter, n_iters, params.n_batch);
|
__func__, params.i_layer+1, params.n_layers, iter+1, n_iters, params.n_batch);
|
||||||
}
|
}
|
||||||
|
|
||||||
// get output tensor
|
// get output tensor
|
||||||
|
@ -298,6 +298,9 @@ static void power_iteration(
|
||||||
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
ggml_backend_tensor_get(last_eigenvector, output->data, 0, ggml_nbytes(last_eigenvector));
|
||||||
//print_debug_tensor(output);
|
//print_debug_tensor(output);
|
||||||
ggml_gallocr_free(allocr);
|
ggml_gallocr_free(allocr);
|
||||||
|
|
||||||
|
// TODO @ngxson : The output vector is randomly inverted
|
||||||
|
// Solution: https://github.com/ggerganov/llama.cpp/pull/8069#issuecomment-2185328171
|
||||||
}
|
}
|
||||||
|
|
||||||
static void run_pca(
|
static void run_pca(
|
||||||
|
|
|
@ -1 +1,4 @@
|
||||||
[INST] Act like a person who is extremely happy. [/INST]
|
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI'm the happiest person in this world
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nAct like a person who is extremely happy<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHello, I'm having the best day ever!
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nYou are in a very good mood<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHi<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi, I'm very excited to meet you
|
||||||
|
<|start_header_id|>system<|end_header_id|>\n\nYou are the happiest person<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat are you feeling?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nEverything is just perfect right now!
|
|
@ -19,3 +19,43 @@ llama-embedding.exe -m ./path/to/model --log-disable -p "Hello World!" 2>$null
|
||||||
```
|
```
|
||||||
|
|
||||||
The above command will output space-separated float values.
|
The above command will output space-separated float values.
|
||||||
|
|
||||||
|
## extra parameters
|
||||||
|
### --embd-normalize $integer$
|
||||||
|
| $integer$ | description | formula |
|
||||||
|
|-----------|---------------------|---------|
|
||||||
|
| $-1$ | none |
|
||||||
|
| $0$ | max absolute int16 | $\Large{{32760 * x_i} \over\max \lvert x_i\rvert}$
|
||||||
|
| $1$ | taxicab | $\Large{x_i \over\sum \lvert x_i\rvert}$
|
||||||
|
| $2$ | euclidean (default) | $\Large{x_i \over\sqrt{\sum x_i^2}}$
|
||||||
|
| $>2$ | p-norm | $\Large{x_i \over\sqrt[p]{\sum \lvert x_i\rvert^p}}$
|
||||||
|
|
||||||
|
### --embd-output-format $'string'$
|
||||||
|
| $'string'$ | description | |
|
||||||
|
|------------|------------------------------|--|
|
||||||
|
| '' | same as before | (default)
|
||||||
|
| 'array' | single embeddings | $[[x_1,...,x_n]]$
|
||||||
|
| | multiple embeddings | $[[x_1,...,x_n],[x_1,...,x_n],...,[x_1,...,x_n]]$
|
||||||
|
| 'json' | openai style |
|
||||||
|
| 'json+' | add cosine similarity matrix |
|
||||||
|
|
||||||
|
### --embd-separator $"string"$
|
||||||
|
| $"string"$ | |
|
||||||
|
|--------------|-|
|
||||||
|
| "\n" | (default)
|
||||||
|
| "<#embSep#>" | for exemple
|
||||||
|
| "<#sep#>" | other exemple
|
||||||
|
|
||||||
|
## examples
|
||||||
|
### Unix-based systems (Linux, macOS, etc.):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./embedding -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||||
|
```
|
||||||
|
|
||||||
|
### Windows:
|
||||||
|
|
||||||
|
```powershell
|
||||||
|
embedding.exe -p 'Castle<#sep#>Stronghold<#sep#>Dog<#sep#>Cat' --embd-separator '<#sep#>' --embd-normalize 2 --embd-output-format '' -m './path/to/model.gguf' --n-gpu-layers 99 --log-disable 2>/dev/null
|
||||||
|
```
|
||||||
|
|
||||||
|
|
|
@ -7,23 +7,30 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static std::vector<std::string> split_lines(const std::string & s) {
|
static std::vector<std::string> split_lines(const std::string & s, const std::string & separator = "\n") {
|
||||||
std::string line;
|
|
||||||
std::vector<std::string> lines;
|
std::vector<std::string> lines;
|
||||||
std::stringstream ss(s);
|
size_t start = 0;
|
||||||
while (std::getline(ss, line)) {
|
size_t end = s.find(separator);
|
||||||
lines.push_back(line);
|
|
||||||
|
while (end != std::string::npos) {
|
||||||
|
lines.push_back(s.substr(start, end - start));
|
||||||
|
start = end + separator.length();
|
||||||
|
end = s.find(separator, start);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
lines.push_back(s.substr(start)); // Add the last part
|
||||||
|
|
||||||
return lines;
|
return lines;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||||
for (size_t i = 0; i < tokens.size(); i++) {
|
size_t n_tokens = tokens.size();
|
||||||
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
|
for (size_t i = 0; i < n_tokens; i++) {
|
||||||
|
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
|
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd, int embd_norm) {
|
||||||
// clear previous kv_cache values (irrelevant for embeddings)
|
// clear previous kv_cache values (irrelevant for embeddings)
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
|
||||||
|
@ -40,22 +47,10 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
|
||||||
|
|
||||||
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
// try to get sequence embeddings - supported only when pooling_type is not NONE
|
||||||
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
|
||||||
if (embd == NULL) {
|
GGML_ASSERT(embd != NULL && "failed to get sequence embeddings");
|
||||||
embd = llama_get_embeddings_ith(ctx, i);
|
|
||||||
if (embd == NULL) {
|
|
||||||
fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
float * out = output + batch.seq_id[i][0] * n_embd;
|
float * out = output + batch.seq_id[i][0] * n_embd;
|
||||||
//TODO: I would also add a parameter here to enable normalization or not.
|
llama_embd_normalize(embd, out, n_embd, embd_norm);
|
||||||
/*fprintf(stdout, "unnormalized_embedding:");
|
|
||||||
for (int hh = 0; hh < n_embd; hh++) {
|
|
||||||
fprintf(stdout, "%9.6f ", embd[hh]);
|
|
||||||
}
|
|
||||||
fprintf(stdout, "\n");*/
|
|
||||||
llama_embd_normalize(embd, out, n_embd);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,6 +92,12 @@ int main(int argc, char ** argv) {
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
|
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
|
@ -109,7 +110,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// split the prompt into lines
|
// split the prompt into lines
|
||||||
std::vector<std::string> prompts = split_lines(params.prompt);
|
std::vector<std::string> prompts = split_lines(params.prompt, params.embd_sep);
|
||||||
|
|
||||||
// max batch size
|
// max batch size
|
||||||
const uint64_t n_batch = params.n_batch;
|
const uint64_t n_batch = params.n_batch;
|
||||||
|
@ -169,7 +170,7 @@ int main(int argc, char ** argv) {
|
||||||
// encode if at capacity
|
// encode if at capacity
|
||||||
if (batch.n_tokens + n_toks > n_batch) {
|
if (batch.n_tokens + n_toks > n_batch) {
|
||||||
float * out = emb + p * n_embd;
|
float * out = emb + p * n_embd;
|
||||||
batch_decode(ctx, batch, out, s, n_embd);
|
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||||
llama_batch_clear(batch);
|
llama_batch_clear(batch);
|
||||||
p += s;
|
p += s;
|
||||||
s = 0;
|
s = 0;
|
||||||
|
@ -182,29 +183,78 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// final batch
|
// final batch
|
||||||
float * out = emb + p * n_embd;
|
float * out = emb + p * n_embd;
|
||||||
batch_decode(ctx, batch, out, s, n_embd);
|
batch_decode(ctx, batch, out, s, n_embd, params.embd_normalize);
|
||||||
|
|
||||||
// print the first part of the embeddings or for a single prompt, the full embedding
|
if (params.embd_out.empty()) {
|
||||||
fprintf(stdout, "\n");
|
// print the first part of the embeddings or for a single prompt, the full embedding
|
||||||
for (int j = 0; j < n_prompts; j++) {
|
|
||||||
fprintf(stdout, "embedding %d: ", j);
|
|
||||||
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
|
||||||
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
|
||||||
}
|
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
}
|
for (int j = 0; j < n_prompts; j++) {
|
||||||
|
fprintf(stdout, "embedding %d: ", j);
|
||||||
// print cosine similarity matrix
|
for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) {
|
||||||
if (n_prompts > 1) {
|
if (params.embd_normalize == 0) {
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "%6.0f ", emb[j * n_embd + i]);
|
||||||
printf("cosine similarity matrix:\n\n");
|
} else {
|
||||||
for (int i = 0; i < n_prompts; i++) {
|
fprintf(stdout, "%9.6f ", emb[j * n_embd + i]);
|
||||||
for (int j = 0; j < n_prompts; j++) {
|
}
|
||||||
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
|
||||||
fprintf(stdout, "%6.2f ", sim);
|
|
||||||
}
|
}
|
||||||
fprintf(stdout, "\n");
|
fprintf(stdout, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// print cosine similarity matrix
|
||||||
|
if (n_prompts > 1) {
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
printf("cosine similarity matrix:\n\n");
|
||||||
|
for (int i = 0; i < n_prompts; i++) {
|
||||||
|
fprintf(stdout, "%6.6s ", prompts[i].c_str());
|
||||||
|
}
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
for (int i = 0; i < n_prompts; i++) {
|
||||||
|
for (int j = 0; j < n_prompts; j++) {
|
||||||
|
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||||
|
fprintf(stdout, "%6.2f ", sim);
|
||||||
|
}
|
||||||
|
fprintf(stdout, "%1.10s", prompts[i].c_str());
|
||||||
|
fprintf(stdout, "\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (params.embd_out == "json" || params.embd_out == "json+" || params.embd_out == "array") {
|
||||||
|
const bool notArray = params.embd_out != "array";
|
||||||
|
|
||||||
|
fprintf(stdout, notArray ? "{\n \"object\": \"list\",\n \"data\": [\n" : "[");
|
||||||
|
for (int j = 0;;) { // at least one iteration (one prompt)
|
||||||
|
if (notArray) fprintf(stdout, " {\n \"object\": \"embedding\",\n \"index\": %d,\n \"embedding\": ",j);
|
||||||
|
fprintf(stdout, "[");
|
||||||
|
for (int i = 0;;) { // at least one iteration (n_embd > 0)
|
||||||
|
fprintf(stdout, params.embd_normalize == 0 ? "%1.0f" : "%1.7f", emb[j * n_embd + i]);
|
||||||
|
i++;
|
||||||
|
if (i < n_embd) fprintf(stdout, ","); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, notArray ? "]\n }" : "]");
|
||||||
|
j++;
|
||||||
|
if (j < n_prompts) fprintf(stdout, notArray ? ",\n" : ","); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, notArray ? "\n ]" : "]\n");
|
||||||
|
|
||||||
|
if (params.embd_out == "json+" && n_prompts > 1) {
|
||||||
|
fprintf(stdout, ",\n \"cosineSimilarity\": [\n");
|
||||||
|
for (int i = 0;;) { // at least two iteration (n_prompts > 1)
|
||||||
|
fprintf(stdout, " [");
|
||||||
|
for (int j = 0;;) { // at least two iteration (n_prompts > 1)
|
||||||
|
float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd);
|
||||||
|
fprintf(stdout, "%6.2f", sim);
|
||||||
|
j++;
|
||||||
|
if (j < n_prompts) fprintf(stdout, ", "); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, " ]");
|
||||||
|
i++;
|
||||||
|
if (i < n_prompts) fprintf(stdout, ",\n"); else break;
|
||||||
|
}
|
||||||
|
fprintf(stdout, "\n ]");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (notArray) fprintf(stdout, "\n}\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
|
|
|
@ -101,7 +101,9 @@ int main(int argc, char** argv) {
|
||||||
auto grammar = llama_grammar_init(
|
auto grammar = llama_grammar_init(
|
||||||
grammar_rules.data(),
|
grammar_rules.data(),
|
||||||
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
|
||||||
|
if (grammar == nullptr) {
|
||||||
|
throw std::runtime_error("Failed to initialize llama_grammar");
|
||||||
|
}
|
||||||
// Read the input file
|
// Read the input file
|
||||||
std::string input_str;
|
std::string input_str;
|
||||||
{
|
{
|
||||||
|
|
|
@ -44,6 +44,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
|
||||||
|
|
||||||
// clear previous kv_cache values (irrelevant for embeddings)
|
// clear previous kv_cache values (irrelevant for embeddings)
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
llama_set_embeddings(ctx, true);
|
||||||
llama_set_causal_attn(ctx, false);
|
llama_set_causal_attn(ctx, false);
|
||||||
|
|
||||||
// run model
|
// run model
|
||||||
|
@ -98,7 +99,9 @@ static std::string generate(llama_context * ctx, const std::string & prompt, boo
|
||||||
llama_token eos_token = llama_token_eos(mdl);
|
llama_token eos_token = llama_token_eos(mdl);
|
||||||
|
|
||||||
llama_kv_cache_clear(ctx);
|
llama_kv_cache_clear(ctx);
|
||||||
|
llama_set_embeddings(ctx, false);
|
||||||
llama_set_causal_attn(ctx, true);
|
llama_set_causal_attn(ctx, true);
|
||||||
|
|
||||||
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
llama_batch bat = llama_batch_init(llama_n_batch(ctx), 0, 1);
|
||||||
|
|
||||||
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
std::vector<llama_token> inputs = llama_tokenize(mdl, prompt, false, true);
|
||||||
|
@ -166,8 +169,7 @@ int main(int argc, char * argv[]) {
|
||||||
|
|
||||||
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
|
llama_model * mdl = llama_load_model_from_file(params.model.c_str(), mparams);
|
||||||
|
|
||||||
// create new context - set to embedding mode
|
// create generation context
|
||||||
cparams.embeddings = true;
|
|
||||||
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
|
llama_context * ctx = llama_new_context_with_model(mdl, cparams);
|
||||||
|
|
||||||
// ### Embedding/Representation ###
|
// ### Embedding/Representation ###
|
||||||
|
|
|
@ -25,7 +25,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument
|
||||||
## Example
|
## Example
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
LLAMA_CUDA=1 make -j
|
GGML_CUDA=1 make -j
|
||||||
|
|
||||||
# generate importance matrix (imatrix.dat)
|
# generate importance matrix (imatrix.dat)
|
||||||
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
./llama-imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
#! pip install pydantic
|
#! pip install pydantic
|
||||||
#! python json-schema-pydantic-example.py
|
#! python json-schema-pydantic-example.py
|
||||||
|
|
||||||
from pydantic import BaseModel, TypeAdapter
|
from pydantic import BaseModel, Extra, TypeAdapter
|
||||||
from annotated_types import MinLen
|
from annotated_types import MinLen
|
||||||
from typing import Annotated, List, Optional
|
from typing import Annotated, List, Optional
|
||||||
import json, requests
|
import json, requests
|
||||||
|
@ -50,11 +50,16 @@ else:
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
class QAPair(BaseModel):
|
class QAPair(BaseModel):
|
||||||
|
class Config:
|
||||||
|
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
|
||||||
question: str
|
question: str
|
||||||
concise_answer: str
|
concise_answer: str
|
||||||
justification: str
|
justification: str
|
||||||
|
stars: Annotated[int, Field(ge=1, le=5)]
|
||||||
|
|
||||||
class PyramidalSummary(BaseModel):
|
class PyramidalSummary(BaseModel):
|
||||||
|
class Config:
|
||||||
|
extra = 'forbid' # triggers additionalProperties: false in the JSON schema
|
||||||
title: str
|
title: str
|
||||||
summary: str
|
summary: str
|
||||||
question_answers: Annotated[List[QAPair], MinLen(2)]
|
question_answers: Annotated[List[QAPair], MinLen(2)]
|
||||||
|
|
|
@ -4,8 +4,7 @@ import itertools
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
from typing import Any, Dict, List, Set, Tuple, Union
|
from typing import Any, List, Optional, Set, Tuple, Union
|
||||||
|
|
||||||
|
|
||||||
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
||||||
|
|
||||||
|
@ -23,6 +22,170 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None):
|
||||||
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
|
result = item_rule + ' ' + _build_repetition(f'({separator_rule} {item_rule})', min_items - 1 if min_items > 0 else 0, max_items - 1 if max_items is not None else None)
|
||||||
return f'({result})?' if min_items == 0 else result
|
return f'({result})?' if min_items == 0 else result
|
||||||
|
|
||||||
|
def _generate_min_max_int(min_value: Optional[int], max_value: Optional[int], out: list, decimals_left: int = 16, top_level: bool = True):
|
||||||
|
has_min = min_value != None
|
||||||
|
has_max = max_value != None
|
||||||
|
|
||||||
|
def digit_range(from_char: str, to_char: str):
|
||||||
|
out.append("[")
|
||||||
|
if from_char == to_char:
|
||||||
|
out.append(from_char)
|
||||||
|
else:
|
||||||
|
out.append(from_char)
|
||||||
|
out.append("-")
|
||||||
|
out.append(to_char)
|
||||||
|
out.append("]")
|
||||||
|
|
||||||
|
def more_digits(min_digits: int, max_digits: int):
|
||||||
|
out.append("[0-9]")
|
||||||
|
if min_digits == max_digits and min_digits == 1:
|
||||||
|
return
|
||||||
|
out.append("{")
|
||||||
|
out.append(str(min_digits))
|
||||||
|
if max_digits != min_digits:
|
||||||
|
out.append(",")
|
||||||
|
if max_digits != sys.maxsize:
|
||||||
|
out.append(str(max_digits))
|
||||||
|
out.append("}")
|
||||||
|
|
||||||
|
def uniform_range(from_str: str, to_str: str):
|
||||||
|
i = 0
|
||||||
|
while i < len(from_str) and from_str[i] == to_str[i]:
|
||||||
|
i += 1
|
||||||
|
if i > 0:
|
||||||
|
out.append("\"")
|
||||||
|
out.append(from_str[:i])
|
||||||
|
out.append("\"")
|
||||||
|
if i < len(from_str):
|
||||||
|
if i > 0:
|
||||||
|
out.append(" ")
|
||||||
|
sub_len = len(from_str) - i - 1
|
||||||
|
if sub_len > 0:
|
||||||
|
from_sub = from_str[i+1:]
|
||||||
|
to_sub = to_str[i+1:]
|
||||||
|
sub_zeros = "0" * sub_len
|
||||||
|
sub_nines = "9" * sub_len
|
||||||
|
|
||||||
|
to_reached = False
|
||||||
|
out.append("(")
|
||||||
|
if from_sub == sub_zeros:
|
||||||
|
digit_range(from_str[i], chr(ord(to_str[i]) - 1))
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(sub_len, sub_len)
|
||||||
|
else:
|
||||||
|
out.append("[")
|
||||||
|
out.append(from_str[i])
|
||||||
|
out.append("] ")
|
||||||
|
out.append("(")
|
||||||
|
uniform_range(from_sub, sub_nines)
|
||||||
|
out.append(")")
|
||||||
|
if ord(from_str[i]) < ord(to_str[i]) - 1:
|
||||||
|
out.append(" | ")
|
||||||
|
if to_sub == sub_nines:
|
||||||
|
digit_range(chr(ord(from_str[i]) + 1), to_str[i])
|
||||||
|
to_reached = True
|
||||||
|
else:
|
||||||
|
digit_range(chr(ord(from_str[i]) + 1), chr(ord(to_str[i]) - 1))
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(sub_len, sub_len)
|
||||||
|
if not to_reached:
|
||||||
|
out.append(" | ")
|
||||||
|
digit_range(to_str[i], to_str[i])
|
||||||
|
out.append(" ")
|
||||||
|
uniform_range(sub_zeros, to_sub)
|
||||||
|
out.append(")")
|
||||||
|
else:
|
||||||
|
out.append("[")
|
||||||
|
out.append(from_str[i])
|
||||||
|
out.append("-")
|
||||||
|
out.append(to_str[i])
|
||||||
|
out.append("]")
|
||||||
|
|
||||||
|
if has_min and has_max:
|
||||||
|
if min_value < 0 and max_value < 0:
|
||||||
|
out.append("\"-\" (")
|
||||||
|
_generate_min_max_int(-max_value, -min_value, out, decimals_left, top_level=True)
|
||||||
|
out.append(")")
|
||||||
|
return
|
||||||
|
|
||||||
|
if min_value < 0:
|
||||||
|
out.append("\"-\" (")
|
||||||
|
_generate_min_max_int(0, -min_value, out, decimals_left, top_level=True)
|
||||||
|
out.append(") | ")
|
||||||
|
min_value = 0
|
||||||
|
|
||||||
|
min_s = str(min_value)
|
||||||
|
max_s = str(max_value)
|
||||||
|
min_digits = len(min_s)
|
||||||
|
max_digits = len(max_s)
|
||||||
|
|
||||||
|
for digits in range(min_digits, max_digits):
|
||||||
|
uniform_range(min_s, "9" * digits)
|
||||||
|
min_s = "1" + "0" * digits
|
||||||
|
out.append(" | ")
|
||||||
|
uniform_range(min_s, max_s)
|
||||||
|
return
|
||||||
|
|
||||||
|
less_decimals = max(decimals_left - 1, 1)
|
||||||
|
|
||||||
|
if has_min:
|
||||||
|
if min_value < 0:
|
||||||
|
out.append("\"-\" (")
|
||||||
|
_generate_min_max_int(None, -min_value, out, decimals_left, top_level=False)
|
||||||
|
out.append(") | [0] | [1-9] ")
|
||||||
|
more_digits(0, decimals_left - 1)
|
||||||
|
elif min_value == 0:
|
||||||
|
if top_level:
|
||||||
|
out.append("[0] | [1-9] ")
|
||||||
|
more_digits(0, less_decimals)
|
||||||
|
else:
|
||||||
|
more_digits(1, decimals_left)
|
||||||
|
elif min_value <= 9:
|
||||||
|
c = str(min_value)
|
||||||
|
range_start = '1' if top_level else '0'
|
||||||
|
if c > range_start:
|
||||||
|
digit_range(range_start, chr(ord(c) - 1))
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(1, less_decimals)
|
||||||
|
out.append(" | ")
|
||||||
|
digit_range(c, "9")
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(0, less_decimals)
|
||||||
|
else:
|
||||||
|
min_s = str(min_value)
|
||||||
|
length = len(min_s)
|
||||||
|
c = min_s[0]
|
||||||
|
|
||||||
|
if c > "1":
|
||||||
|
digit_range("1" if top_level else "0", chr(ord(c) - 1))
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(length, less_decimals)
|
||||||
|
out.append(" | ")
|
||||||
|
digit_range(c, c)
|
||||||
|
out.append(" (")
|
||||||
|
_generate_min_max_int(int(min_s[1:]), None, out, less_decimals, top_level=False)
|
||||||
|
out.append(")")
|
||||||
|
if c < "9":
|
||||||
|
out.append(" | ")
|
||||||
|
digit_range(chr(ord(c) + 1), "9")
|
||||||
|
out.append(" ")
|
||||||
|
more_digits(length - 1, less_decimals)
|
||||||
|
return
|
||||||
|
|
||||||
|
if has_max:
|
||||||
|
if max_value >= 0:
|
||||||
|
if top_level:
|
||||||
|
out.append("\"-\" [1-9] ")
|
||||||
|
more_digits(0, less_decimals)
|
||||||
|
out.append(" | ")
|
||||||
|
_generate_min_max_int(0, max_value, out, decimals_left, top_level=True)
|
||||||
|
else:
|
||||||
|
out.append("\"-\" (")
|
||||||
|
_generate_min_max_int(-max_value, None, out, decimals_left, top_level=False)
|
||||||
|
out.append(")")
|
||||||
|
return
|
||||||
|
|
||||||
|
raise RuntimeError("At least one of min_value or max_value must be set")
|
||||||
|
|
||||||
class BuiltinRule:
|
class BuiltinRule:
|
||||||
def __init__(self, content: str, deps: list = None):
|
def __init__(self, content: str, deps: list = None):
|
||||||
|
@ -112,6 +275,51 @@ class SchemaConverter:
|
||||||
|
|
||||||
return ''.join(('(', *recurse(0), ')'))
|
return ''.join(('(', *recurse(0), ')'))
|
||||||
|
|
||||||
|
def _not_strings(self, strings):
|
||||||
|
class TrieNode:
|
||||||
|
def __init__(self):
|
||||||
|
self.children = {}
|
||||||
|
self.is_end_of_string = False
|
||||||
|
|
||||||
|
def insert(self, string):
|
||||||
|
node = self
|
||||||
|
for c in string:
|
||||||
|
node = node.children.setdefault(c, TrieNode())
|
||||||
|
node.is_end_of_string = True
|
||||||
|
|
||||||
|
trie = TrieNode()
|
||||||
|
for s in strings:
|
||||||
|
trie.insert(s)
|
||||||
|
|
||||||
|
char_rule = self._add_primitive('char', PRIMITIVE_RULES['char'])
|
||||||
|
out = ['["] ( ']
|
||||||
|
|
||||||
|
def visit(node):
|
||||||
|
rejects = []
|
||||||
|
first = True
|
||||||
|
for c in sorted(node.children.keys()):
|
||||||
|
child = node.children[c]
|
||||||
|
rejects.append(c)
|
||||||
|
if first:
|
||||||
|
first = False
|
||||||
|
else:
|
||||||
|
out.append(' | ')
|
||||||
|
out.append(f'[{c}]')
|
||||||
|
if child.children:
|
||||||
|
out.append(f' (')
|
||||||
|
visit(child)
|
||||||
|
out.append(')')
|
||||||
|
elif child.is_end_of_string:
|
||||||
|
out.append(f' {char_rule}+')
|
||||||
|
if node.children:
|
||||||
|
if not first:
|
||||||
|
out.append(' | ')
|
||||||
|
out.append(f'[^"{"".join(rejects)}] {char_rule}*')
|
||||||
|
visit(trie)
|
||||||
|
|
||||||
|
out.append(f' ){"" if trie.is_end_of_string else "?"} ["] space')
|
||||||
|
return ''.join(out)
|
||||||
|
|
||||||
def _add_rule(self, name, rule):
|
def _add_rule(self, name, rule):
|
||||||
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
|
esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
|
||||||
if esc_name not in self._rules or self._rules[esc_name] == rule:
|
if esc_name not in self._rules or self._rules[esc_name] == rule:
|
||||||
|
@ -357,13 +565,13 @@ class SchemaConverter:
|
||||||
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
|
return self._add_rule(rule_name, self._generate_union_rule(name, schema.get('oneOf') or schema['anyOf']))
|
||||||
|
|
||||||
elif isinstance(schema_type, list):
|
elif isinstance(schema_type, list):
|
||||||
return self._add_rule(rule_name, self._generate_union_rule(name, [{'type': t} for t in schema_type]))
|
return self._add_rule(rule_name, self._generate_union_rule(name, [{**schema, 'type': t} for t in schema_type]))
|
||||||
|
|
||||||
elif 'const' in schema:
|
elif 'const' in schema:
|
||||||
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']))
|
return self._add_rule(rule_name, self._generate_constant_rule(schema['const']) + ' space')
|
||||||
|
|
||||||
elif 'enum' in schema:
|
elif 'enum' in schema:
|
||||||
rule = ' | '.join((self._generate_constant_rule(v) for v in schema['enum']))
|
rule = '(' + ' | '.join((self._generate_constant_rule(v) for v in schema['enum'])) + ') space'
|
||||||
return self._add_rule(rule_name, rule)
|
return self._add_rule(rule_name, rule)
|
||||||
|
|
||||||
elif schema_type in (None, 'object') and \
|
elif schema_type in (None, 'object') and \
|
||||||
|
@ -432,6 +640,24 @@ class SchemaConverter:
|
||||||
|
|
||||||
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
|
return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space')
|
||||||
|
|
||||||
|
elif schema_type in (None, 'integer') and \
|
||||||
|
('minimum' in schema or 'exclusiveMinimum' in schema or 'maximum' in schema or 'exclusiveMaximum' in schema):
|
||||||
|
min_value = None
|
||||||
|
max_value = None
|
||||||
|
if 'minimum' in schema:
|
||||||
|
min_value = schema['minimum']
|
||||||
|
elif 'exclusiveMinimum' in schema:
|
||||||
|
min_value = schema['exclusiveMinimum'] + 1
|
||||||
|
if 'maximum' in schema:
|
||||||
|
max_value = schema['maximum']
|
||||||
|
elif 'exclusiveMaximum' in schema:
|
||||||
|
max_value = schema['exclusiveMaximum'] - 1
|
||||||
|
|
||||||
|
out = ["("]
|
||||||
|
_generate_min_max_int(min_value, max_value, out)
|
||||||
|
out.append(") space")
|
||||||
|
return self._add_rule(rule_name, ''.join(out))
|
||||||
|
|
||||||
elif (schema_type == 'object') or (len(schema) == 0):
|
elif (schema_type == 'object') or (len(schema) == 0):
|
||||||
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
|
return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object']))
|
||||||
|
|
||||||
|
@ -450,7 +676,7 @@ class SchemaConverter:
|
||||||
self._add_primitive(dep, dep_rule)
|
self._add_primitive(dep, dep_rule)
|
||||||
return n
|
return n
|
||||||
|
|
||||||
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]):
|
def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Optional[Union[bool, Any]]):
|
||||||
prop_order = self._prop_order
|
prop_order = self._prop_order
|
||||||
# sort by position in prop_order (if specified) then by original order
|
# sort by position in prop_order (if specified) then by original order
|
||||||
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
|
sorted_props = [kv[0] for _, kv in sorted(enumerate(properties), key=lambda ikv: (prop_order.get(ikv[1][0], len(prop_order)), ikv[0]))]
|
||||||
|
@ -465,12 +691,16 @@ class SchemaConverter:
|
||||||
required_props = [k for k in sorted_props if k in required]
|
required_props = [k for k in sorted_props if k in required]
|
||||||
optional_props = [k for k in sorted_props if k not in required]
|
optional_props = [k for k in sorted_props if k not in required]
|
||||||
|
|
||||||
if additional_properties == True or isinstance(additional_properties, dict):
|
if additional_properties != False:
|
||||||
sub_name = f'{name}{"-" if name else ""}additional'
|
sub_name = f'{name}{"-" if name else ""}additional'
|
||||||
value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value')
|
value_rule = self.visit(additional_properties, f'{sub_name}-value') if isinstance(additional_properties, dict) else \
|
||||||
|
self._add_primitive('value', PRIMITIVE_RULES['value'])
|
||||||
|
key_rule = self._add_primitive('string', PRIMITIVE_RULES['string']) if not sorted_props \
|
||||||
|
else self._add_rule(f'{sub_name}-k', self._not_strings(sorted_props))
|
||||||
|
|
||||||
prop_kv_rule_names["*"] = self._add_rule(
|
prop_kv_rule_names["*"] = self._add_rule(
|
||||||
f'{sub_name}-kv',
|
f'{sub_name}-kv',
|
||||||
self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}'
|
f'{key_rule} ":" space {value_rule}'
|
||||||
)
|
)
|
||||||
optional_props.append("*")
|
optional_props.append("*")
|
||||||
|
|
||||||
|
@ -485,15 +715,11 @@ class SchemaConverter:
|
||||||
def get_recursive_refs(ks, first_is_optional):
|
def get_recursive_refs(ks, first_is_optional):
|
||||||
[k, *rest] = ks
|
[k, *rest] = ks
|
||||||
kv_rule_name = prop_kv_rule_names[k]
|
kv_rule_name = prop_kv_rule_names[k]
|
||||||
if k == '*':
|
comma_ref = f'( "," space {kv_rule_name} )'
|
||||||
res = self._add_rule(
|
if first_is_optional:
|
||||||
f'{name}{"-" if name else ""}additional-kvs',
|
res = comma_ref + ('*' if k == '*' else '?')
|
||||||
f'{kv_rule_name} ( "," space ' + kv_rule_name + ' )*'
|
|
||||||
)
|
|
||||||
elif first_is_optional:
|
|
||||||
res = f'( "," space {kv_rule_name} )?'
|
|
||||||
else:
|
else:
|
||||||
res = kv_rule_name
|
res = kv_rule_name + (' ' + comma_ref + "*" if k == '*' else '')
|
||||||
if len(rest) > 0:
|
if len(rest) > 0:
|
||||||
res += ' ' + self._add_rule(
|
res += ' ' + self._add_rule(
|
||||||
f'{name}{"-" if name else ""}{k}-rest',
|
f'{name}{"-" if name else ""}{k}-rest',
|
||||||
|
|
|
@ -131,22 +131,29 @@ class LlamaState: ObservableObject {
|
||||||
|
|
||||||
messageLog += "\(text)"
|
messageLog += "\(text)"
|
||||||
|
|
||||||
while await llamaContext.n_cur < llamaContext.n_len {
|
Task.detached {
|
||||||
let result = await llamaContext.completion_loop()
|
while await llamaContext.n_cur < llamaContext.n_len {
|
||||||
messageLog += "\(result)"
|
let result = await llamaContext.completion_loop()
|
||||||
|
await MainActor.run {
|
||||||
|
self.messageLog += "\(result)"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let t_end = DispatchTime.now().uptimeNanoseconds
|
||||||
|
let t_generation = Double(t_end - t_heat_end) / self.NS_PER_S
|
||||||
|
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
|
||||||
|
|
||||||
|
await llamaContext.clear()
|
||||||
|
|
||||||
|
await MainActor.run {
|
||||||
|
self.messageLog += """
|
||||||
|
\n
|
||||||
|
Done
|
||||||
|
Heat up took \(t_heat)s
|
||||||
|
Generated \(tokens_per_second) t/s\n
|
||||||
|
"""
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let t_end = DispatchTime.now().uptimeNanoseconds
|
|
||||||
let t_generation = Double(t_end - t_heat_end) / NS_PER_S
|
|
||||||
let tokens_per_second = Double(await llamaContext.n_len) / t_generation
|
|
||||||
|
|
||||||
await llamaContext.clear()
|
|
||||||
messageLog += """
|
|
||||||
\n
|
|
||||||
Done
|
|
||||||
Heat up took \(t_heat)s
|
|
||||||
Generated \(tokens_per_second) t/s\n
|
|
||||||
"""
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func bench() async {
|
func bench() async {
|
||||||
|
|
|
@ -194,7 +194,7 @@ llama_print_timings: total time = 44411.01 ms / 377 tokens
|
||||||
## Orin compile and run
|
## Orin compile and run
|
||||||
### compile
|
### compile
|
||||||
```sh
|
```sh
|
||||||
make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32
|
make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32
|
||||||
```
|
```
|
||||||
### run on Orin
|
### run on Orin
|
||||||
### case 1
|
### case 1
|
||||||
|
|
|
@ -39,12 +39,12 @@ static std::ostringstream * g_output_ss;
|
||||||
static std::vector<llama_token> * g_output_tokens;
|
static std::vector<llama_token> * g_output_tokens;
|
||||||
static bool is_interacting = false;
|
static bool is_interacting = false;
|
||||||
|
|
||||||
static bool file_exists(const std::string &path) {
|
static bool file_exists(const std::string & path) {
|
||||||
std::ifstream f(path.c_str());
|
std::ifstream f(path.c_str());
|
||||||
return f.good();
|
return f.good();
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool file_is_empty(const std::string &path) {
|
static bool file_is_empty(const std::string & path) {
|
||||||
std::ifstream f;
|
std::ifstream f;
|
||||||
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
f.exceptions(std::ifstream::failbit | std::ifstream::badbit);
|
||||||
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
f.open(path.c_str(), std::ios::in | std::ios::binary | std::ios::ate);
|
||||||
|
@ -117,6 +117,14 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
|
||||||
LOG_TEE("%s", text);
|
LOG_TEE("%s", text);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static std::string chat_add_and_format(struct llama_model * model, std::vector<llama_chat_msg> & chat_msgs, std::string role, std::string content) {
|
||||||
|
llama_chat_msg new_msg{role, content};
|
||||||
|
auto formatted = llama_chat_format_single(
|
||||||
|
model, g_params->chat_template, chat_msgs, new_msg, role == "user");
|
||||||
|
chat_msgs.push_back({role, content});
|
||||||
|
return formatted;
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = ¶ms;
|
g_params = ¶ms;
|
||||||
|
@ -190,6 +198,7 @@ int main(int argc, char ** argv) {
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
llama_context * ctx;
|
llama_context * ctx;
|
||||||
llama_context * ctx_guidance = NULL;
|
llama_context * ctx_guidance = NULL;
|
||||||
|
std::vector<llama_chat_msg> chat_msgs;
|
||||||
g_model = &model;
|
g_model = &model;
|
||||||
g_ctx = &ctx;
|
g_ctx = &ctx;
|
||||||
|
|
||||||
|
@ -215,6 +224,8 @@ int main(int argc, char ** argv) {
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
LOG_TEE("%s: chat template example: %s\n", __func__, llama_chat_format_example(model, params.chat_template).c_str());
|
||||||
|
|
||||||
// print system information
|
// print system information
|
||||||
{
|
{
|
||||||
LOG_TEE("\n");
|
LOG_TEE("\n");
|
||||||
|
@ -249,16 +260,21 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::vector<llama_token> embd_inp;
|
std::vector<llama_token> embd_inp;
|
||||||
|
|
||||||
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
{
|
||||||
LOG("tokenize the prompt\n");
|
auto prompt = params.conversation
|
||||||
embd_inp = ::llama_tokenize(ctx, params.prompt, true, true);
|
? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
|
||||||
} else {
|
: params.prompt;
|
||||||
LOG("use session tokens\n");
|
if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
|
||||||
embd_inp = session_tokens;
|
LOG("tokenize the prompt\n");
|
||||||
}
|
embd_inp = ::llama_tokenize(ctx, prompt, true, true);
|
||||||
|
} else {
|
||||||
|
LOG("use session tokens\n");
|
||||||
|
embd_inp = session_tokens;
|
||||||
|
}
|
||||||
|
|
||||||
LOG("prompt: \"%s\"\n", log_tostr(params.prompt));
|
LOG("prompt: \"%s\"\n", log_tostr(prompt));
|
||||||
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str());
|
||||||
|
}
|
||||||
|
|
||||||
// Should not run without any tokens
|
// Should not run without any tokens
|
||||||
if (embd_inp.empty()) {
|
if (embd_inp.empty()) {
|
||||||
|
@ -478,6 +494,7 @@ int main(int argc, char ** argv) {
|
||||||
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
std::vector<int> input_tokens; g_input_tokens = &input_tokens;
|
||||||
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
std::vector<int> output_tokens; g_output_tokens = &output_tokens;
|
||||||
std::ostringstream output_ss; g_output_ss = &output_ss;
|
std::ostringstream output_ss; g_output_ss = &output_ss;
|
||||||
|
std::ostringstream assistant_ss; // for storing current assistant message, used in conversation mode
|
||||||
|
|
||||||
// the first thing we will do is to output the prompt, so set color accordingly
|
// the first thing we will do is to output the prompt, so set color accordingly
|
||||||
console::set_display(console::prompt);
|
console::set_display(console::prompt);
|
||||||
|
@ -793,11 +810,18 @@ int main(int argc, char ** argv) {
|
||||||
is_antiprompt = true;
|
is_antiprompt = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
chat_add_and_format(model, chat_msgs, "system", assistant_ss.str());
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// if current token is not EOG, we add it to current assistant message
|
||||||
|
if (params.conversation) {
|
||||||
|
auto id = llama_sampling_last(ctx_sampling);
|
||||||
|
assistant_ss << llama_token_to_piece(ctx, id, false);
|
||||||
|
}
|
||||||
|
|
||||||
if (n_past > 0 && is_interacting) {
|
if (n_past > 0 && is_interacting) {
|
||||||
LOG("waiting for user input\n");
|
LOG("waiting for user input\n");
|
||||||
|
|
||||||
|
@ -848,8 +872,12 @@ int main(int argc, char ** argv) {
|
||||||
string_process_escapes(buffer);
|
string_process_escapes(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string user_inp = params.conversation
|
||||||
|
? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
|
||||||
|
: std::move(buffer);
|
||||||
|
// TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
|
||||||
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
|
||||||
const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
|
const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
|
||||||
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
|
||||||
|
|
||||||
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
|
||||||
|
@ -864,6 +892,9 @@ int main(int argc, char ** argv) {
|
||||||
output_ss << llama_token_to_piece(ctx, token);
|
output_ss << llama_token_to_piece(ctx, token);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// reset assistant message
|
||||||
|
assistant_ss.str("");
|
||||||
|
|
||||||
n_remain -= line_inp.size();
|
n_remain -= line_inp.size();
|
||||||
LOG("n_remain: %d\n", n_remain);
|
LOG("n_remain: %d\n", n_remain);
|
||||||
} else {
|
} else {
|
||||||
|
|
|
@ -16,41 +16,41 @@ struct quant_option {
|
||||||
};
|
};
|
||||||
|
|
||||||
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
|
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
|
||||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 4.78G, +0.4511 ppl @ Llama-3-8B", },
|
||||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 5.21G, +0.1316 ppl @ Llama-3-8B", },
|
||||||
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
|
{ "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 5.65G, +0.1062 ppl @ Llama-3-8B", },
|
||||||
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
{ "IQ2_XXS",LLAMA_FTYPE_MOSTLY_IQ2_XXS," 2.06 bpw quantization", },
|
||||||
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
{ "IQ2_XS", LLAMA_FTYPE_MOSTLY_IQ2_XS, " 2.31 bpw quantization", },
|
||||||
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
{ "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", },
|
||||||
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
{ "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", },
|
||||||
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
{ "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", },
|
||||||
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
{ "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", },
|
||||||
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
|
{ "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.96G, +3.5199 ppl @ Llama-3-8B", },
|
||||||
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
|
{ "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.96G, +3.1836 ppl @ Llama-3-8B", },
|
||||||
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
{ "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", },
|
||||||
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
{ "IQ3_S", LLAMA_FTYPE_MOSTLY_IQ3_S, " 3.44 bpw quantization", },
|
||||||
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
{ "IQ3_M", LLAMA_FTYPE_MOSTLY_IQ3_M, " 3.66 bpw quantization mix", },
|
||||||
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
{ "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
|
||||||
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization" , },
|
{ "IQ3_XS", LLAMA_FTYPE_MOSTLY_IQ3_XS, " 3.3 bpw quantization", },
|
||||||
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
|
{ "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 3.41G, +1.6321 ppl @ Llama-3-8B", },
|
||||||
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
|
{ "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.74G, +0.6569 ppl @ Llama-3-8B", },
|
||||||
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 3.35G, +0.1764 ppl @ LLaMA-v1-7B", },
|
{ "Q3_K_L", LLAMA_FTYPE_MOSTLY_Q3_K_L, " 4.03G, +0.5562 ppl @ Llama-3-8B", },
|
||||||
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
|
{ "IQ4_NL", LLAMA_FTYPE_MOSTLY_IQ4_NL, " 4.50 bpw non-linear quantization", },
|
||||||
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
|
{ "IQ4_XS", LLAMA_FTYPE_MOSTLY_IQ4_XS, " 4.25 bpw non-linear quantization", },
|
||||||
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
{ "Q4_K", LLAMA_FTYPE_MOSTLY_Q4_K_M, "alias for Q4_K_M", },
|
||||||
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 3.59G, +0.0992 ppl @ LLaMA-v1-7B", },
|
{ "Q4_K_S", LLAMA_FTYPE_MOSTLY_Q4_K_S, " 4.37G, +0.2689 ppl @ Llama-3-8B", },
|
||||||
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 3.80G, +0.0532 ppl @ LLaMA-v1-7B", },
|
{ "Q4_K_M", LLAMA_FTYPE_MOSTLY_Q4_K_M, " 4.58G, +0.1754 ppl @ Llama-3-8B", },
|
||||||
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
{ "Q5_K", LLAMA_FTYPE_MOSTLY_Q5_K_M, "alias for Q5_K_M", },
|
||||||
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 4.33G, +0.0400 ppl @ LLaMA-v1-7B", },
|
{ "Q5_K_S", LLAMA_FTYPE_MOSTLY_Q5_K_S, " 5.21G, +0.1049 ppl @ Llama-3-8B", },
|
||||||
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
|
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
|
||||||
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
|
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
|
||||||
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
|
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
|
||||||
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
|
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
|
||||||
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
|
||||||
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
|
||||||
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
|
||||||
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file";
|
||||||
|
|
|
@ -73,9 +73,10 @@ static std::vector<chunk> chunk_file(const std::string & filename, int chunk_siz
|
||||||
return chunks;
|
return chunks;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, int seq_id) {
|
static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & tokens, llama_seq_id seq_id) {
|
||||||
for (size_t i = 0; i < tokens.size(); i++) {
|
size_t n_tokens = tokens.size();
|
||||||
llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
|
for (size_t i = 0; i < n_tokens; i++) {
|
||||||
|
llama_batch_add(batch, tokens[i], i, { seq_id }, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -160,6 +161,12 @@ int main(int argc, char ** argv) {
|
||||||
const int n_ctx_train = llama_n_ctx_train(model);
|
const int n_ctx_train = llama_n_ctx_train(model);
|
||||||
const int n_ctx = llama_n_ctx(ctx);
|
const int n_ctx = llama_n_ctx(ctx);
|
||||||
|
|
||||||
|
const enum llama_pooling_type pooling_type = llama_pooling_type(ctx);
|
||||||
|
if (pooling_type == LLAMA_POOLING_TYPE_NONE) {
|
||||||
|
fprintf(stderr, "%s: error: pooling type NONE not supported\n", __func__);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (n_ctx > n_ctx_train) {
|
if (n_ctx > n_ctx_train) {
|
||||||
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n",
|
||||||
__func__, n_ctx_train, n_ctx);
|
__func__, n_ctx_train, n_ctx);
|
||||||
|
|
|
@ -29,13 +29,13 @@ You can also run multiple `rpc-server` instances on the same host, each with a d
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
On each host, build the corresponding backend with `cmake` and add `-DLLAMA_RPC=ON` to the build options.
|
On each host, build the corresponding backend with `cmake` and add `-DGGML_RPC=ON` to the build options.
|
||||||
For example, to build the CUDA backend with RPC support:
|
For example, to build the CUDA backend with RPC support:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-rpc-cuda
|
mkdir build-rpc-cuda
|
||||||
cd build-rpc-cuda
|
cd build-rpc-cuda
|
||||||
cmake .. -DLLAMA_CUDA=ON -DLLAMA_RPC=ON
|
cmake .. -DGGML_CUDA=ON -DGGML_RPC=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
@ -58,12 +58,12 @@ $ CUDA_VISIBLE_DEVICES=0 bin/rpc-server -p 50052
|
||||||
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
This way you can run multiple `rpc-server` instances on the same host, each with a different CUDA device.
|
||||||
|
|
||||||
|
|
||||||
On the main host build `llama.cpp` only with `-DLLAMA_RPC=ON`:
|
On the main host build `llama.cpp` only with `-DGGML_RPC=ON`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build-rpc
|
mkdir build-rpc
|
||||||
cd build-rpc
|
cd build-rpc
|
||||||
cmake .. -DLLAMA_RPC=ON
|
cmake .. -DGGML_RPC=ON
|
||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,14 @@
|
||||||
set(TARGET llama-server)
|
set(TARGET llama-server)
|
||||||
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
|
||||||
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
|
||||||
|
|
||||||
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
|
||||||
|
|
||||||
|
if (MINGW)
|
||||||
|
# fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
|
||||||
|
add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
|
||||||
|
endif()
|
||||||
|
|
||||||
set(TARGET_SRCS
|
set(TARGET_SRCS
|
||||||
server.cpp
|
server.cpp
|
||||||
utils.hpp
|
utils.hpp
|
||||||
|
@ -24,6 +31,7 @@ set(PUBLIC_ASSETS
|
||||||
prompt-formats.js
|
prompt-formats.js
|
||||||
json-schema-to-grammar.mjs
|
json-schema-to-grammar.mjs
|
||||||
)
|
)
|
||||||
|
|
||||||
foreach(asset ${PUBLIC_ASSETS})
|
foreach(asset ${PUBLIC_ASSETS})
|
||||||
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}")
|
||||||
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp")
|
||||||
|
@ -34,18 +42,23 @@ foreach(asset ${PUBLIC_ASSETS})
|
||||||
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake"
|
||||||
)
|
)
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_executable(${TARGET} ${TARGET_SRCS})
|
add_executable(${TARGET} ${TARGET_SRCS})
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_compile_definitions(${TARGET} PRIVATE
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
find_package(OpenSSL REQUIRED)
|
find_package(OpenSSL REQUIRED)
|
||||||
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto)
|
||||||
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
target_compile_definitions(${TARGET} PRIVATE CPPHTTPLIB_OPENSSL_SUPPORT)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (WIN32)
|
if (WIN32)
|
||||||
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||||
|
|
|
@ -634,12 +634,12 @@ return html`
|
||||||
<div>
|
<div>
|
||||||
<div class="grammar">
|
<div class="grammar">
|
||||||
<label for="template"></label>
|
<label for="template"></label>
|
||||||
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON-Scheme + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
<textarea id="grammar" name="grammar" placeholder="Use GBNF or JSON Schema + Converter" value="${params.value.grammar}" rows=4 oninput=${updateParams}/>
|
||||||
</div>
|
</div>
|
||||||
<div class="grammar-columns">
|
<div class="grammar-columns">
|
||||||
<div class="json-schema-controls">
|
<div class="json-schema-controls">
|
||||||
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
<input type="text" name="prop-order" placeholder="Order: prop1,prop2,prop3" oninput=${updateGrammarJsonSchemaPropOrder} />
|
||||||
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON-Scheme</button>
|
<button type="button" class="button-grammar" onclick=${convertJSONSchemaGrammar}>Convert JSON Schema</button>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
|
@ -24,6 +24,201 @@ function _buildRepetition(itemRule, minItems, maxItems, opts={}) {
|
||||||
return minItems === 0 ? `(${result})?` : result;
|
return minItems === 0 ? `(${result})?` : result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function _generateMinMaxInt(minValue, maxValue, out, decimalsLeft = 16, topLevel = true) {
|
||||||
|
const hasMin = minValue !== null;
|
||||||
|
const hasMax = maxValue !== null;
|
||||||
|
|
||||||
|
function digitRange(fromChar, toChar) {
|
||||||
|
out.push("[");
|
||||||
|
if (fromChar === toChar) {
|
||||||
|
out.push(fromChar);
|
||||||
|
} else {
|
||||||
|
out.push(fromChar);
|
||||||
|
out.push("-");
|
||||||
|
out.push(toChar);
|
||||||
|
}
|
||||||
|
out.push("]");
|
||||||
|
}
|
||||||
|
|
||||||
|
function moreDigits(minDigits, maxDigits) {
|
||||||
|
out.push("[0-9]");
|
||||||
|
if (minDigits === maxDigits && minDigits === 1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
out.push("{");
|
||||||
|
out.push(minDigits.toString());
|
||||||
|
if (maxDigits !== minDigits) {
|
||||||
|
out.push(",");
|
||||||
|
if (maxDigits !== Number.MAX_SAFE_INTEGER) {
|
||||||
|
out.push(maxDigits.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
out.push("}");
|
||||||
|
}
|
||||||
|
|
||||||
|
function uniformRange(fromStr, toStr) {
|
||||||
|
let i = 0;
|
||||||
|
while (i < fromStr.length && fromStr[i] === toStr[i]) {
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
if (i > 0) {
|
||||||
|
out.push("\"");
|
||||||
|
out.push(fromStr.slice(0, i));
|
||||||
|
out.push("\"");
|
||||||
|
}
|
||||||
|
if (i < fromStr.length) {
|
||||||
|
if (i > 0) {
|
||||||
|
out.push(" ");
|
||||||
|
}
|
||||||
|
const subLen = fromStr.length - i - 1;
|
||||||
|
if (subLen > 0) {
|
||||||
|
const fromSub = fromStr.slice(i + 1);
|
||||||
|
const toSub = toStr.slice(i + 1);
|
||||||
|
const subZeros = "0".repeat(subLen);
|
||||||
|
const subNines = "9".repeat(subLen);
|
||||||
|
|
||||||
|
let toReached = false;
|
||||||
|
out.push("(");
|
||||||
|
if (fromSub === subZeros) {
|
||||||
|
digitRange(fromStr[i], String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(subLen, subLen);
|
||||||
|
} else {
|
||||||
|
out.push("[");
|
||||||
|
out.push(fromStr[i]);
|
||||||
|
out.push("] ");
|
||||||
|
out.push("(");
|
||||||
|
uniformRange(fromSub, subNines);
|
||||||
|
out.push(")");
|
||||||
|
if (fromStr.charCodeAt(i) < toStr.charCodeAt(i) - 1) {
|
||||||
|
out.push(" | ");
|
||||||
|
if (toSub === subNines) {
|
||||||
|
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), toStr[i]);
|
||||||
|
toReached = true;
|
||||||
|
} else {
|
||||||
|
digitRange(String.fromCharCode(fromStr.charCodeAt(i) + 1), String.fromCharCode(toStr.charCodeAt(i) - 1));
|
||||||
|
}
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(subLen, subLen);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!toReached) {
|
||||||
|
out.push(" | ");
|
||||||
|
digitRange(toStr[i], toStr[i]);
|
||||||
|
out.push(" ");
|
||||||
|
uniformRange(subZeros, toSub);
|
||||||
|
}
|
||||||
|
out.push(")");
|
||||||
|
} else {
|
||||||
|
out.push("[");
|
||||||
|
out.push(fromStr[i]);
|
||||||
|
out.push("-");
|
||||||
|
out.push(toStr[i]);
|
||||||
|
out.push("]");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasMin && hasMax) {
|
||||||
|
if (minValue < 0 && maxValue < 0) {
|
||||||
|
out.push("\"-\" (");
|
||||||
|
_generateMinMaxInt(-maxValue, -minValue, out, decimalsLeft, true);
|
||||||
|
out.push(")");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (minValue < 0) {
|
||||||
|
out.push("\"-\" (");
|
||||||
|
_generateMinMaxInt(0, -minValue, out, decimalsLeft, true);
|
||||||
|
out.push(") | ");
|
||||||
|
minValue = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
let minS = minValue.toString();
|
||||||
|
const maxS = maxValue.toString();
|
||||||
|
const minDigits = minS.length;
|
||||||
|
const maxDigits = maxS.length;
|
||||||
|
|
||||||
|
for (let digits = minDigits; digits < maxDigits; digits++) {
|
||||||
|
uniformRange(minS, "9".repeat(digits));
|
||||||
|
minS = "1" + "0".repeat(digits);
|
||||||
|
out.push(" | ");
|
||||||
|
}
|
||||||
|
uniformRange(minS, maxS);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const lessDecimals = Math.max(decimalsLeft - 1, 1);
|
||||||
|
|
||||||
|
if (hasMin) {
|
||||||
|
if (minValue < 0) {
|
||||||
|
out.push("\"-\" (");
|
||||||
|
_generateMinMaxInt(null, -minValue, out, decimalsLeft, false);
|
||||||
|
out.push(") | [0] | [1-9] ");
|
||||||
|
moreDigits(0, decimalsLeft - 1);
|
||||||
|
} else if (minValue === 0) {
|
||||||
|
if (topLevel) {
|
||||||
|
out.push("[0] | [1-9] ");
|
||||||
|
moreDigits(0, lessDecimals);
|
||||||
|
} else {
|
||||||
|
moreDigits(1, decimalsLeft);
|
||||||
|
}
|
||||||
|
} else if (minValue <= 9) {
|
||||||
|
const c = minValue.toString();
|
||||||
|
const range_start = topLevel ? '1' : '0';
|
||||||
|
if (c > range_start) {
|
||||||
|
digitRange(range_start, String.fromCharCode(c.charCodeAt(0) - 1));
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(1, lessDecimals);
|
||||||
|
out.push(" | ");
|
||||||
|
}
|
||||||
|
digitRange(c, "9");
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(0, lessDecimals);
|
||||||
|
} else {
|
||||||
|
const minS = minValue.toString();
|
||||||
|
const length = minS.length;
|
||||||
|
const c = minS[0];
|
||||||
|
|
||||||
|
if (c > "1") {
|
||||||
|
digitRange(topLevel ? "1" : "0", String.fromCharCode(c.charCodeAt(0) - 1));
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(length, lessDecimals);
|
||||||
|
out.push(" | ");
|
||||||
|
}
|
||||||
|
digitRange(c, c);
|
||||||
|
out.push(" (");
|
||||||
|
_generateMinMaxInt(parseInt(minS.slice(1)), null, out, lessDecimals, false);
|
||||||
|
out.push(")");
|
||||||
|
if (c < "9") {
|
||||||
|
out.push(" | ");
|
||||||
|
digitRange(String.fromCharCode(c.charCodeAt(0) + 1), "9");
|
||||||
|
out.push(" ");
|
||||||
|
moreDigits(length - 1, lessDecimals);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (hasMax) {
|
||||||
|
if (maxValue >= 0) {
|
||||||
|
if (topLevel) {
|
||||||
|
out.push("\"-\" [1-9] ");
|
||||||
|
moreDigits(0, lessDecimals);
|
||||||
|
out.push(" | ");
|
||||||
|
}
|
||||||
|
_generateMinMaxInt(0, maxValue, out, decimalsLeft, true);
|
||||||
|
} else {
|
||||||
|
out.push("\"-\" (");
|
||||||
|
_generateMinMaxInt(-maxValue, null, out, decimalsLeft, false);
|
||||||
|
out.push(")");
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
throw new Error("At least one of minValue or maxValue must be set");
|
||||||
|
}
|
||||||
|
|
||||||
class BuiltinRule {
|
class BuiltinRule {
|
||||||
constructor(content, deps) {
|
constructor(content, deps) {
|
||||||
this.content = content;
|
this.content = content;
|
||||||
|
@ -337,6 +532,64 @@ export class SchemaConverter {
|
||||||
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
return this._addRule(name, "\"\\\"\" " + toRule(transform()) + " \"\\\"\" space")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
_notStrings(strings) {
|
||||||
|
class TrieNode {
|
||||||
|
constructor() {
|
||||||
|
this.children = {};
|
||||||
|
this.isEndOfString = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
insert(str) {
|
||||||
|
let node = this;
|
||||||
|
for (const c of str) {
|
||||||
|
node = node.children[c] = node.children[c] || new TrieNode();
|
||||||
|
}
|
||||||
|
node.isEndOfString = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const trie = new TrieNode();
|
||||||
|
for (const s of strings) {
|
||||||
|
trie.insert(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
const charRuleName = this._addPrimitive('char', PRIMITIVE_RULES['char']);
|
||||||
|
const out = ['["] ( '];
|
||||||
|
|
||||||
|
const visit = (node) => {
|
||||||
|
const rejects = [];
|
||||||
|
let first = true;
|
||||||
|
for (const c of Object.keys(node.children).sort()) {
|
||||||
|
const child = node.children[c];
|
||||||
|
rejects.push(c);
|
||||||
|
if (first) {
|
||||||
|
first = false;
|
||||||
|
} else {
|
||||||
|
out.push(' | ');
|
||||||
|
}
|
||||||
|
out.push(`[${c}]`);
|
||||||
|
if (Object.keys(child.children).length > 0) {
|
||||||
|
out.push(' (');
|
||||||
|
visit(child);
|
||||||
|
out.push(')');
|
||||||
|
} else if (child.isEndOfString) {
|
||||||
|
out.push(` ${charRuleName}+`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Object.keys(node.children).length > 0) {
|
||||||
|
if (!first) {
|
||||||
|
out.push(' | ');
|
||||||
|
}
|
||||||
|
out.push(`[^"${rejects.join('')}] ${charRuleName}*`);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
visit(trie);
|
||||||
|
|
||||||
|
out.push(` )${trie.isEndOfString ? '' : '?'} ["] space`);
|
||||||
|
return out.join('');
|
||||||
|
}
|
||||||
|
|
||||||
_resolveRef(ref) {
|
_resolveRef(ref) {
|
||||||
let refName = ref.split('/').pop();
|
let refName = ref.split('/').pop();
|
||||||
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
|
if (!(refName in this._rules) && !this._refsBeingResolved.has(ref)) {
|
||||||
|
@ -363,11 +616,11 @@ export class SchemaConverter {
|
||||||
} else if (schema.oneOf || schema.anyOf) {
|
} else if (schema.oneOf || schema.anyOf) {
|
||||||
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
|
return this._addRule(ruleName, this._generateUnionRule(name, schema.oneOf || schema.anyOf));
|
||||||
} else if (Array.isArray(schemaType)) {
|
} else if (Array.isArray(schemaType)) {
|
||||||
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({ type: t }))));
|
return this._addRule(ruleName, this._generateUnionRule(name, schemaType.map(t => ({...schema, type: t}))));
|
||||||
} else if ('const' in schema) {
|
} else if ('const' in schema) {
|
||||||
return this._addRule(ruleName, this._generateConstantRule(schema.const));
|
return this._addRule(ruleName, this._generateConstantRule(schema.const) + ' space');
|
||||||
} else if ('enum' in schema) {
|
} else if ('enum' in schema) {
|
||||||
const rule = schema.enum.map(v => this._generateConstantRule(v)).join(' | ');
|
const rule = '(' + schema.enum.map(v => this._generateConstantRule(v)).join(' | ') + ') space';
|
||||||
return this._addRule(ruleName, rule);
|
return this._addRule(ruleName, rule);
|
||||||
} else if ((schemaType === undefined || schemaType === 'object') &&
|
} else if ((schemaType === undefined || schemaType === 'object') &&
|
||||||
('properties' in schema ||
|
('properties' in schema ||
|
||||||
|
@ -404,7 +657,7 @@ export class SchemaConverter {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, /* additionalProperties= */ false));
|
return this._addRule(ruleName, this._buildObjectRule(properties, required, name, null));
|
||||||
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
|
} else if ((schemaType === undefined || schemaType === 'array') && ('items' in schema || 'prefixItems' in schema)) {
|
||||||
const items = schema.items ?? schema.prefixItems;
|
const items = schema.items ?? schema.prefixItems;
|
||||||
if (Array.isArray(items)) {
|
if (Array.isArray(items)) {
|
||||||
|
@ -435,6 +688,24 @@ export class SchemaConverter {
|
||||||
const minLen = schema.minLength || 0;
|
const minLen = schema.minLength || 0;
|
||||||
const maxLen = schema.maxLength;
|
const maxLen = schema.maxLength;
|
||||||
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
|
return this._addRule(ruleName, '"\\\"" ' + _buildRepetition(charRuleName, minLen, maxLen) + ' "\\\"" space');
|
||||||
|
} else if (schemaType === 'integer' && ('minimum' in schema || 'exclusiveMinimum' in schema || 'maximum' in schema || 'exclusiveMaximum' in schema)) {
|
||||||
|
let minValue = null;
|
||||||
|
let maxValue = null;
|
||||||
|
if ('minimum' in schema) {
|
||||||
|
minValue = schema.minimum;
|
||||||
|
} else if ('exclusiveMinimum' in schema) {
|
||||||
|
minValue = schema.exclusiveMinimum + 1;
|
||||||
|
}
|
||||||
|
if ('maximum' in schema) {
|
||||||
|
maxValue = schema.maximum;
|
||||||
|
} else if ('exclusiveMaximum' in schema) {
|
||||||
|
maxValue = schema.exclusiveMaximum - 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
const out = ["("];
|
||||||
|
_generateMinMaxInt(minValue, maxValue, out);
|
||||||
|
out.push(") space");
|
||||||
|
return this._addRule(ruleName, out.join(''));
|
||||||
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
|
} else if ((schemaType === 'object') || (Object.keys(schema).length === 0)) {
|
||||||
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
|
return this._addRule(ruleName, this._addPrimitive('object', PRIMITIVE_RULES['object']));
|
||||||
} else {
|
} else {
|
||||||
|
@ -480,12 +751,19 @@ export class SchemaConverter {
|
||||||
const requiredProps = sortedProps.filter(k => required.has(k));
|
const requiredProps = sortedProps.filter(k => required.has(k));
|
||||||
const optionalProps = sortedProps.filter(k => !required.has(k));
|
const optionalProps = sortedProps.filter(k => !required.has(k));
|
||||||
|
|
||||||
if (typeof additionalProperties === 'object' || additionalProperties === true) {
|
if (additionalProperties !== false) {
|
||||||
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
|
const subName = `${name ?? ''}${name ? '-' : ''}additional`;
|
||||||
const valueRule = this.visit(additionalProperties === true ? {} : additionalProperties, `${subName}-value`);
|
const valueRule =
|
||||||
|
additionalProperties != null && typeof additionalProperties === 'object' ? this.visit(additionalProperties, `${subName}-value`)
|
||||||
|
: this._addPrimitive('value', PRIMITIVE_RULES['value']);
|
||||||
|
|
||||||
|
const key_rule =
|
||||||
|
sortedProps.length === 0 ? this._addPrimitive('string', PRIMITIVE_RULES['string'])
|
||||||
|
: this._addRule(`${subName}-k`, this._notStrings(sortedProps));
|
||||||
|
|
||||||
propKvRuleNames['*'] = this._addRule(
|
propKvRuleNames['*'] = this._addRule(
|
||||||
`${subName}-kv`,
|
`${subName}-kv`,
|
||||||
`${this._addPrimitive('string', PRIMITIVE_RULES['string'])} ":" space ${valueRule}`);
|
`${key_rule} ":" space ${valueRule}`);
|
||||||
optionalProps.push('*');
|
optionalProps.push('*');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -502,15 +780,11 @@ export class SchemaConverter {
|
||||||
const [k, ...rest] = ks;
|
const [k, ...rest] = ks;
|
||||||
const kvRuleName = propKvRuleNames[k];
|
const kvRuleName = propKvRuleNames[k];
|
||||||
let res;
|
let res;
|
||||||
if (k === '*') {
|
const commaRef = `( "," space ${kvRuleName} )`;
|
||||||
res = this._addRule(
|
if (firstIsOptional) {
|
||||||
`${name ?? ''}${name ? '-' : ''}additional-kvs`,
|
res = commaRef + (k === '*' ? '*' : '?');
|
||||||
`${kvRuleName} ( "," space ` + kvRuleName + ` )*`
|
|
||||||
)
|
|
||||||
} else if (firstIsOptional) {
|
|
||||||
res = `( "," space ${kvRuleName} )?`;
|
|
||||||
} else {
|
} else {
|
||||||
res = kvRuleName;
|
res = kvRuleName + (k === '*' ? ' ' + commaRef + '*' : '');
|
||||||
}
|
}
|
||||||
if (rest.length > 0) {
|
if (rest.length > 0) {
|
||||||
res += ' ' + this._addRule(
|
res += ' ' + this._addRule(
|
||||||
|
|
|
@ -3,6 +3,13 @@
|
||||||
|
|
||||||
by Humans for All.
|
by Humans for All.
|
||||||
|
|
||||||
|
## quickstart
|
||||||
|
|
||||||
|
To run from the build dir
|
||||||
|
|
||||||
|
bin/llama-server -m path/model.gguf --path ../examples/server/public_simplechat
|
||||||
|
|
||||||
|
Continue reading for the details.
|
||||||
|
|
||||||
## overview
|
## overview
|
||||||
|
|
||||||
|
@ -14,6 +21,8 @@ own system prompts.
|
||||||
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
|
This allows seeing the generated text / ai-model response in oneshot at the end, after it is fully generated,
|
||||||
or potentially as it is being generated, in a streamed manner from the server/ai-model.
|
or potentially as it is being generated, in a streamed manner from the server/ai-model.
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
|
Auto saves the chat session locally as and when the chat is progressing and inturn at a later time when you
|
||||||
open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
|
open SimpleChat, option is provided to restore the old chat session, if a matching one exists.
|
||||||
|
|
||||||
|
@ -170,17 +179,23 @@ It is attached to the document object. Some of these can also be updated using t
|
||||||
The histogram/freq based trimming logic is currently tuned for english language wrt its
|
The histogram/freq based trimming logic is currently tuned for english language wrt its
|
||||||
is-it-a-alpabetic|numeral-char regex match logic.
|
is-it-a-alpabetic|numeral-char regex match logic.
|
||||||
|
|
||||||
chatRequestOptions - maintains the list of options/fields to send along with chat request,
|
apiRequestOptions - maintains the list of options/fields to send along with api request,
|
||||||
irrespective of whether /chat/completions or /completions endpoint.
|
irrespective of whether /chat/completions or /completions endpoint.
|
||||||
|
|
||||||
If you want to add additional options/fields to send to the server/ai-model, and or
|
If you want to add additional options/fields to send to the server/ai-model, and or
|
||||||
modify the existing options value or remove them, for now you can update this global var
|
modify the existing options value or remove them, for now you can update this global var
|
||||||
using browser's development-tools/console.
|
using browser's development-tools/console.
|
||||||
|
|
||||||
For string and numeric fields in chatRequestOptions, including even those added by a user
|
For string, numeric and boolean fields in apiRequestOptions, including even those added by a
|
||||||
at runtime by directly modifying gMe.chatRequestOptions, setting ui entries will be auto
|
user at runtime by directly modifying gMe.apiRequestOptions, setting ui entries will be auto
|
||||||
created.
|
created.
|
||||||
|
|
||||||
|
cache_prompt option supported by example/server is allowed to be controlled by user, so that
|
||||||
|
any caching supported wrt system-prompt and chat history, if usable can get used. When chat
|
||||||
|
history sliding window is enabled, cache_prompt logic may or may not kick in at the backend
|
||||||
|
wrt same, based on aspects related to model, positional encoding, attention mechanism etal.
|
||||||
|
However system prompt should ideally get the benefit of caching.
|
||||||
|
|
||||||
headers - maintains the list of http headers sent when request is made to the server. By default
|
headers - maintains the list of http headers sent when request is made to the server. By default
|
||||||
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
|
Content-Type is set to application/json. Additionally Authorization entry is provided, which can
|
||||||
be set if needed using the settings ui.
|
be set if needed using the settings ui.
|
||||||
|
@ -197,10 +212,10 @@ It is attached to the document object. Some of these can also be updated using t
|
||||||
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
>0 : Send the latest chat history from the latest system prompt, limited to specified cnt.
|
||||||
|
|
||||||
|
|
||||||
By using gMe's iRecentUserMsgCnt and chatRequestOptions.max_tokens one can try to control the
|
By using gMe's iRecentUserMsgCnt and apiRequestOptions.max_tokens/n_predict one can try to control
|
||||||
implications of loading of the ai-model's context window by chat history, wrt chat response to
|
the implications of loading of the ai-model's context window by chat history, wrt chat response to
|
||||||
some extent in a simple crude way. You may also want to control the context size enabled when
|
some extent in a simple crude way. You may also want to control the context size enabled when the
|
||||||
the server loads ai-model, on the server end.
|
server loads ai-model, on the server end.
|
||||||
|
|
||||||
|
|
||||||
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
Sometimes the browser may be stuborn with caching of the file, so your updates to html/css/js
|
||||||
|
@ -237,12 +252,12 @@ also be started with a model context size of 1k or more, to be on safe side.
|
||||||
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
internal n_predict, for now add the same here on the client side, maybe later add max_tokens
|
||||||
to /completions endpoint handling code on server side.
|
to /completions endpoint handling code on server side.
|
||||||
|
|
||||||
NOTE: One may want to experiment with frequency/presence penalty fields in chatRequestOptions
|
NOTE: One may want to experiment with frequency/presence penalty fields in apiRequestOptions
|
||||||
wrt the set of fields sent to server along with the user query. To check how the model behaves
|
wrt the set of fields sent to server along with the user query, to check how the model behaves
|
||||||
wrt repeatations in general in the generated text response.
|
wrt repeatations in general in the generated text response.
|
||||||
|
|
||||||
A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
|
A end-user can change these behaviour by editing gMe from browser's devel-tool/console or by
|
||||||
using the providing settings ui.
|
using the provided settings ui (for settings exposed through the ui).
|
||||||
|
|
||||||
|
|
||||||
### OpenAi / Equivalent API WebService
|
### OpenAi / Equivalent API WebService
|
||||||
|
@ -253,7 +268,7 @@ for a minimal chatting experimentation by setting the below.
|
||||||
* the baseUrl in settings ui
|
* the baseUrl in settings ui
|
||||||
* https://api.openai.com/v1 or similar
|
* https://api.openai.com/v1 or similar
|
||||||
|
|
||||||
* Wrt request body - gMe.chatRequestOptions
|
* Wrt request body - gMe.apiRequestOptions
|
||||||
* model (settings ui)
|
* model (settings ui)
|
||||||
* any additional fields if required in future
|
* any additional fields if required in future
|
||||||
|
|
||||||
|
|
|
@ -222,8 +222,8 @@ class SimpleChat {
|
||||||
* @param {Object} obj
|
* @param {Object} obj
|
||||||
*/
|
*/
|
||||||
request_jsonstr_extend(obj) {
|
request_jsonstr_extend(obj) {
|
||||||
for(let k in gMe.chatRequestOptions) {
|
for(let k in gMe.apiRequestOptions) {
|
||||||
obj[k] = gMe.chatRequestOptions[k];
|
obj[k] = gMe.apiRequestOptions[k];
|
||||||
}
|
}
|
||||||
if (gMe.bStream) {
|
if (gMe.bStream) {
|
||||||
obj["stream"] = true;
|
obj["stream"] = true;
|
||||||
|
@ -740,11 +740,12 @@ class Me {
|
||||||
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
"Authorization": "", // Authorization: Bearer OPENAI_API_KEY
|
||||||
}
|
}
|
||||||
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
// Add needed fields wrt json object to be sent wrt LLM web services completions endpoint.
|
||||||
this.chatRequestOptions = {
|
this.apiRequestOptions = {
|
||||||
"model": "gpt-3.5-turbo",
|
"model": "gpt-3.5-turbo",
|
||||||
"temperature": 0.7,
|
"temperature": 0.7,
|
||||||
"max_tokens": 1024,
|
"max_tokens": 1024,
|
||||||
"n_predict": 1024,
|
"n_predict": 1024,
|
||||||
|
"cache_prompt": false,
|
||||||
//"frequency_penalty": 1.2,
|
//"frequency_penalty": 1.2,
|
||||||
//"presence_penalty": 1.2,
|
//"presence_penalty": 1.2,
|
||||||
};
|
};
|
||||||
|
@ -800,51 +801,55 @@ class Me {
|
||||||
|
|
||||||
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
ui.el_create_append_p(`bStream:${this.bStream}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
||||||
|
|
||||||
|
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
ui.el_create_append_p(`bCompletionFreshChatAlways:${this.bCompletionFreshChatAlways}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
ui.el_create_append_p(`bCompletionInsertStandardRolePrefix:${this.bCompletionInsertStandardRolePrefix}`, elDiv);
|
||||||
|
|
||||||
ui.el_create_append_p(`bTrimGarbage:${this.bTrimGarbage}`, elDiv);
|
|
||||||
|
|
||||||
ui.el_create_append_p(`iRecentUserMsgCnt:${this.iRecentUserMsgCnt}`, elDiv);
|
|
||||||
|
|
||||||
ui.el_create_append_p(`ApiEndPoint:${this.apiEP}`, elDiv);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ui.el_create_append_p(`chatRequestOptions:${JSON.stringify(this.chatRequestOptions, null, " - ")}`, elDiv);
|
ui.el_create_append_p(`apiRequestOptions:${JSON.stringify(this.apiRequestOptions, null, " - ")}`, elDiv);
|
||||||
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
ui.el_create_append_p(`headers:${JSON.stringify(this.headers, null, " - ")}`, elDiv);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Auto create ui input elements for fields in ChatRequestOptions
|
* Auto create ui input elements for fields in apiRequestOptions
|
||||||
* Currently supports text and number field types.
|
* Currently supports text and number field types.
|
||||||
* @param {HTMLDivElement} elDiv
|
* @param {HTMLDivElement} elDiv
|
||||||
*/
|
*/
|
||||||
show_settings_chatrequestoptions(elDiv) {
|
show_settings_apirequestoptions(elDiv) {
|
||||||
let typeDict = {
|
let typeDict = {
|
||||||
"string": "text",
|
"string": "text",
|
||||||
"number": "number",
|
"number": "number",
|
||||||
};
|
};
|
||||||
let fs = document.createElement("fieldset");
|
let fs = document.createElement("fieldset");
|
||||||
let legend = document.createElement("legend");
|
let legend = document.createElement("legend");
|
||||||
legend.innerText = "ChatRequestOptions";
|
legend.innerText = "ApiRequestOptions";
|
||||||
fs.appendChild(legend);
|
fs.appendChild(legend);
|
||||||
elDiv.appendChild(fs);
|
elDiv.appendChild(fs);
|
||||||
for(const k in this.chatRequestOptions) {
|
for(const k in this.apiRequestOptions) {
|
||||||
let val = this.chatRequestOptions[k];
|
let val = this.apiRequestOptions[k];
|
||||||
let type = typeof(val);
|
let type = typeof(val);
|
||||||
if (!((type == "string") || (type == "number"))) {
|
if (((type == "string") || (type == "number"))) {
|
||||||
continue;
|
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.apiRequestOptions[k], (val)=>{
|
||||||
|
if (type == "number") {
|
||||||
|
val = Number(val);
|
||||||
|
}
|
||||||
|
this.apiRequestOptions[k] = val;
|
||||||
|
});
|
||||||
|
fs.appendChild(inp.div);
|
||||||
|
} else if (type == "boolean") {
|
||||||
|
let bbtn = ui.el_creatediv_boolbutton(`Set{k}`, k, {true: "true", false: "false"}, val, (userVal)=>{
|
||||||
|
this.apiRequestOptions[k] = userVal;
|
||||||
|
});
|
||||||
|
fs.appendChild(bbtn.div);
|
||||||
}
|
}
|
||||||
let inp = ui.el_creatediv_input(`Set${k}`, k, typeDict[type], this.chatRequestOptions[k], (val)=>{
|
|
||||||
if (type == "number") {
|
|
||||||
val = Number(val);
|
|
||||||
}
|
|
||||||
this.chatRequestOptions[k] = val;
|
|
||||||
});
|
|
||||||
fs.appendChild(inp.div);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -870,6 +875,23 @@ class Me {
|
||||||
});
|
});
|
||||||
elDiv.appendChild(bb.div);
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
|
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
||||||
|
this.bTrimGarbage = val;
|
||||||
|
});
|
||||||
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
|
this.show_settings_apirequestoptions(elDiv);
|
||||||
|
|
||||||
|
let sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
||||||
|
this.apiEP = ApiEP.Type[val];
|
||||||
|
});
|
||||||
|
elDiv.appendChild(sel.div);
|
||||||
|
|
||||||
|
sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
||||||
|
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
||||||
|
});
|
||||||
|
elDiv.appendChild(sel.div);
|
||||||
|
|
||||||
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
bb = ui.el_creatediv_boolbutton("SetCompletionFreshChatAlways", "CompletionFreshChatAlways", {true: "[+] yes fresh", false: "[-] no, with history"}, this.bCompletionFreshChatAlways, (val)=>{
|
||||||
this.bCompletionFreshChatAlways = val;
|
this.bCompletionFreshChatAlways = val;
|
||||||
});
|
});
|
||||||
|
@ -880,23 +902,6 @@ class Me {
|
||||||
});
|
});
|
||||||
elDiv.appendChild(bb.div);
|
elDiv.appendChild(bb.div);
|
||||||
|
|
||||||
bb = ui.el_creatediv_boolbutton("SetTrimGarbage", "TrimGarbage", {true: "[+] yes trim", false: "[-] dont trim"}, this.bTrimGarbage, (val)=>{
|
|
||||||
this.bTrimGarbage = val;
|
|
||||||
});
|
|
||||||
elDiv.appendChild(bb.div);
|
|
||||||
|
|
||||||
let sel = ui.el_creatediv_select("SetChatHistoryInCtxt", "ChatHistoryInCtxt", this.sRecentUserMsgCnt, this.iRecentUserMsgCnt, (val)=>{
|
|
||||||
this.iRecentUserMsgCnt = this.sRecentUserMsgCnt[val];
|
|
||||||
});
|
|
||||||
elDiv.appendChild(sel.div);
|
|
||||||
|
|
||||||
sel = ui.el_creatediv_select("SetApiEP", "ApiEndPoint", ApiEP.Type, this.apiEP, (val)=>{
|
|
||||||
this.apiEP = ApiEP.Type[val];
|
|
||||||
});
|
|
||||||
elDiv.appendChild(sel.div);
|
|
||||||
|
|
||||||
this.show_settings_chatrequestoptions(elDiv);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
BIN
examples/server/public_simplechat/simplechat_screens.webp
Normal file
Binary file not shown.
After Width: | Height: | Size: 21 KiB |
|
@ -1594,7 +1594,7 @@ struct server_context {
|
||||||
} else {
|
} else {
|
||||||
std::string prompt;
|
std::string prompt;
|
||||||
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
if (task.data.contains("prompt") && task.data.at("prompt").is_string()) {
|
||||||
json_value(task.data, "prompt", std::string());
|
prompt = json_value(task.data, "prompt", std::string());
|
||||||
}
|
}
|
||||||
|
|
||||||
slot = get_available_slot(prompt);
|
slot = get_available_slot(prompt);
|
||||||
|
@ -2612,17 +2612,9 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
// print sample chat example to make it clear which template is used
|
// print sample chat example to make it clear which template is used
|
||||||
{
|
{
|
||||||
json chat;
|
|
||||||
chat.push_back({{"role", "system"}, {"content", "You are a helpful assistant"}});
|
|
||||||
chat.push_back({{"role", "user"}, {"content", "Hello"}});
|
|
||||||
chat.push_back({{"role", "assistant"}, {"content", "Hi there"}});
|
|
||||||
chat.push_back({{"role", "user"}, {"content", "How are you?"}});
|
|
||||||
|
|
||||||
const std::string chat_example = format_chat(ctx_server.model, params.chat_template, chat);
|
|
||||||
|
|
||||||
LOG_INFO("chat template", {
|
LOG_INFO("chat template", {
|
||||||
{"chat_example", chat_example},
|
{"chat_example", llama_chat_format_example(ctx_server.model, params.chat_template)},
|
||||||
{"built_in", params.chat_template.empty()},
|
{"built_in", params.chat_template.empty()},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -82,7 +82,7 @@ Feature: llama.cpp server
|
||||||
|
|
||||||
Examples: Prompts
|
Examples: Prompts
|
||||||
| response_format | n_predicted | re_content |
|
| response_format | n_predicted | re_content |
|
||||||
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
|
| {"type": "json_object", "schema": {"const": "42"}} | 6 | "42" |
|
||||||
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
|
||||||
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
| {"type": "json_object"} | 10 | \{ " Jacky. |
|
||||||
|
|
||||||
|
|
|
@ -118,36 +118,17 @@ static inline void server_log(const char * level, const char * function, int lin
|
||||||
|
|
||||||
// Format given chat. If tmpl is empty, we take the template from model metadata
|
// Format given chat. If tmpl is empty, we take the template from model metadata
|
||||||
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector<json> & messages) {
|
||||||
size_t alloc_size = 0;
|
std::vector<llama_chat_msg> chat;
|
||||||
// vector holding all allocated string to be passed to llama_chat_apply_template
|
|
||||||
std::vector<std::string> str(messages.size() * 2);
|
|
||||||
std::vector<llama_chat_message> chat(messages.size());
|
|
||||||
|
|
||||||
for (size_t i = 0; i < messages.size(); ++i) {
|
for (size_t i = 0; i < messages.size(); ++i) {
|
||||||
const auto & curr_msg = messages[i];
|
const auto & curr_msg = messages[i];
|
||||||
str[i*2 + 0] = json_value(curr_msg, "role", std::string(""));
|
std::string role = json_value(curr_msg, "role", std::string(""));
|
||||||
str[i*2 + 1] = json_value(curr_msg, "content", std::string(""));
|
std::string content = json_value(curr_msg, "content", std::string(""));
|
||||||
alloc_size += str[i*2 + 1].length();
|
chat.push_back({role, content});
|
||||||
chat[i].role = str[i*2 + 0].c_str();
|
|
||||||
chat[i].content = str[i*2 + 1].c_str();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const char * ptr_tmpl = tmpl.empty() ? nullptr : tmpl.c_str();
|
auto formatted_chat = llama_chat_apply_template(model, tmpl, chat, true);
|
||||||
std::vector<char> buf(alloc_size * 2);
|
|
||||||
|
|
||||||
// run the first time to get the total output length
|
|
||||||
int32_t res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
|
||||||
|
|
||||||
// if it turns out that our buffer is too small, we resize it
|
|
||||||
if ((size_t) res > buf.size()) {
|
|
||||||
buf.resize(res);
|
|
||||||
res = llama_chat_apply_template(model, ptr_tmpl, chat.data(), chat.size(), true, buf.data(), buf.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string formatted_chat(buf.data(), res);
|
|
||||||
|
|
||||||
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
LOG_VERBOSE("formatted_chat", {{"text", formatted_chat.c_str()}});
|
||||||
|
|
||||||
return formatted_chat;
|
return formatted_chat;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,10 +8,10 @@ cd build
|
||||||
source /opt/intel/oneapi/setvars.sh
|
source /opt/intel/oneapi/setvars.sh
|
||||||
|
|
||||||
#for FP16
|
#for FP16
|
||||||
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
|
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # faster for long-prompt inference
|
||||||
|
|
||||||
#for FP32
|
#for FP32
|
||||||
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||||
|
|
||||||
#build example/main
|
#build example/main
|
||||||
#cmake --build . --config Release --target main
|
#cmake --build . --config Release --target main
|
||||||
|
|
|
@ -13,16 +13,16 @@ if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
:: for FP16
|
:: for FP16
|
||||||
:: faster for long-prompt inference
|
:: faster for long-prompt inference
|
||||||
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON
|
:: cmake -G "MinGW Makefiles" .. -DGGML_SYCL=ON -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release -DGGML_SYCL_F16=ON
|
||||||
|
|
||||||
:: for FP32
|
:: for FP32
|
||||||
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
cmake -G "Ninja" .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=cl -DCMAKE_CXX_COMPILER=icx -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
:: build example/main only
|
:: build example/main only
|
||||||
:: make main
|
:: make main
|
||||||
|
|
||||||
:: build all binary
|
:: build all binary
|
||||||
make -j
|
cmake --build . -j
|
||||||
if %errorlevel% neq 0 goto ERROR
|
if %errorlevel% neq 0 goto ERROR
|
||||||
|
|
||||||
cd ..
|
cd ..
|
||||||
|
|
2138
ggml-cuda/mmq.cuh
2138
ggml-cuda/mmq.cuh
File diff suppressed because it is too large
Load diff
240
ggml/CMakeLists.txt
Normal file
240
ggml/CMakeLists.txt
Normal file
|
@ -0,0 +1,240 @@
|
||||||
|
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
|
||||||
|
project("ggml" C CXX)
|
||||||
|
include(CheckIncludeFileCXX)
|
||||||
|
|
||||||
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
||||||
|
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
|
||||||
|
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
|
||||||
|
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
|
||||||
|
set(GGML_STANDALONE ON)
|
||||||
|
|
||||||
|
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||||
|
|
||||||
|
# configure project version
|
||||||
|
# TODO
|
||||||
|
else()
|
||||||
|
set(GGML_STANDALONE OFF)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (EMSCRIPTEN)
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
|
||||||
|
option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
|
||||||
|
else()
|
||||||
|
if (MINGW)
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||||
|
else()
|
||||||
|
set(BUILD_SHARED_LIBS_DEFAULT ON)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
|
||||||
|
|
||||||
|
#
|
||||||
|
# option list
|
||||||
|
#
|
||||||
|
|
||||||
|
# TODO: mark all options as advanced when not GGML_STANDALONE
|
||||||
|
|
||||||
|
if (APPLE)
|
||||||
|
set(GGML_METAL_DEFAULT ON)
|
||||||
|
set(GGML_BLAS_DEFAULT ON)
|
||||||
|
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
|
||||||
|
else()
|
||||||
|
set(GGML_METAL_DEFAULT OFF)
|
||||||
|
set(GGML_BLAS_DEFAULT OFF)
|
||||||
|
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# general
|
||||||
|
option(GGML_STATIC "ggml: static link libraries" OFF)
|
||||||
|
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
|
||||||
|
option(GGML_LTO "ggml: enable link time optimization" OFF)
|
||||||
|
option(GGML_CCACHE "ggml: use ccache if available" ON)
|
||||||
|
|
||||||
|
# debug
|
||||||
|
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
|
||||||
|
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
|
||||||
|
option(GGML_GPROF "ggml: enable gprof" OFF)
|
||||||
|
|
||||||
|
# build
|
||||||
|
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
|
||||||
|
|
||||||
|
# sanitizers
|
||||||
|
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
|
||||||
|
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
|
||||||
|
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
|
||||||
|
|
||||||
|
# instruction set specific
|
||||||
|
if (GGML_NATIVE)
|
||||||
|
set(INS_ENB OFF)
|
||||||
|
else()
|
||||||
|
set(INS_ENB ON)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||||
|
|
||||||
|
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||||
|
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||||
|
option(GGML_AVX512 "ggml: enable AVX512" OFF)
|
||||||
|
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
|
||||||
|
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
|
||||||
|
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
|
||||||
|
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
|
||||||
|
if (NOT MSVC)
|
||||||
|
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
|
||||||
|
endif()
|
||||||
|
option(GGML_LASX "ggml: enable lasx" ON)
|
||||||
|
option(GGML_LSX "ggml: enable lsx" ON)
|
||||||
|
option(GGML_SVE "ggml: enable SVE" OFF)
|
||||||
|
|
||||||
|
if (WIN32)
|
||||||
|
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# ggml core
|
||||||
|
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
||||||
|
|
||||||
|
# 3rd party libs / backends
|
||||||
|
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
||||||
|
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
|
||||||
|
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
|
||||||
|
"ggml: BLAS library vendor")
|
||||||
|
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
|
||||||
|
|
||||||
|
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||||
|
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||||
|
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
||||||
|
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
||||||
|
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
||||||
|
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
||||||
|
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
||||||
|
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
||||||
|
"ggml: iters./thread per block for Q2_K/Q6_K")
|
||||||
|
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||||
|
"ggml: max. batch size for using peer access")
|
||||||
|
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||||
|
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
|
||||||
|
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||||
|
option(GGML_CUDA_USE_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" OFF)
|
||||||
|
|
||||||
|
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
|
||||||
|
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
||||||
|
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||||
|
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||||
|
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||||
|
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
|
||||||
|
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
|
||||||
|
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
|
||||||
|
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
|
||||||
|
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
|
||||||
|
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
|
||||||
|
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
|
||||||
|
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
|
||||||
|
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
|
||||||
|
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
|
||||||
|
"ggml: metal minimum macOS version")
|
||||||
|
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
|
||||||
|
option(GGML_OPENMP "ggml: use OpenMP" ON)
|
||||||
|
option(GGML_RPC "ggml: use RPC" OFF)
|
||||||
|
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||||
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||||
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||||
|
"ggml: sycl target device")
|
||||||
|
|
||||||
|
# extra artifacts
|
||||||
|
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
||||||
|
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
|
||||||
|
|
||||||
|
#
|
||||||
|
# dependencies
|
||||||
|
#
|
||||||
|
|
||||||
|
set(CMAKE_C_STANDARD 11)
|
||||||
|
set(CMAKE_C_STANDARD_REQUIRED true)
|
||||||
|
|
||||||
|
if (GGML_SYCL)
|
||||||
|
set(CMAKE_CXX_STANDARD 17)
|
||||||
|
else()
|
||||||
|
set(CMAKE_CXX_STANDARD 11)
|
||||||
|
endif()
|
||||||
|
set(CMAKE_CXX_STANDARD_REQUIRED true)
|
||||||
|
|
||||||
|
set(THREADS_PREFER_PTHREAD_FLAG ON)
|
||||||
|
|
||||||
|
find_package(Threads REQUIRED)
|
||||||
|
|
||||||
|
#
|
||||||
|
# build the library
|
||||||
|
#
|
||||||
|
|
||||||
|
add_subdirectory(src)
|
||||||
|
|
||||||
|
#
|
||||||
|
# tests and examples
|
||||||
|
#
|
||||||
|
|
||||||
|
if (GGML_BUILD_TESTS)
|
||||||
|
enable_testing()
|
||||||
|
add_subdirectory(tests)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
if (GGML_BUILD_EXAMPLES)
|
||||||
|
add_subdirectory(examples)
|
||||||
|
endif ()
|
||||||
|
|
||||||
|
#
|
||||||
|
# install
|
||||||
|
#
|
||||||
|
|
||||||
|
include(GNUInstallDirs)
|
||||||
|
include(CMakePackageConfigHelpers)
|
||||||
|
|
||||||
|
set(GGML_PUBLIC_HEADERS
|
||||||
|
include/ggml.h
|
||||||
|
include/ggml-alloc.h
|
||||||
|
include/ggml-backend.h
|
||||||
|
"${GGML_HEADERS_CUDA}"
|
||||||
|
"${GGML_HEADERS_METAL}"
|
||||||
|
"${GGML_HEADERS_EXTRA}")
|
||||||
|
|
||||||
|
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||||
|
#if (GGML_METAL)
|
||||||
|
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
|
||||||
|
#endif()
|
||||||
|
install(TARGETS ggml PUBLIC_HEADER)
|
||||||
|
|
||||||
|
if (BUILD_SHARED_LIBS)
|
||||||
|
install(TARGETS ggml LIBRARY)
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_METAL)
|
||||||
|
install(
|
||||||
|
FILES src/ggml-metal.metal
|
||||||
|
PERMISSIONS
|
||||||
|
OWNER_READ
|
||||||
|
OWNER_WRITE
|
||||||
|
GROUP_READ
|
||||||
|
WORLD_READ
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR})
|
||||||
|
|
||||||
|
if (NOT GGML_METAL_EMBED_LIBRARY)
|
||||||
|
install(
|
||||||
|
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
|
||||||
|
DESTINATION ${CMAKE_INSTALL_BINDIR}
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
if (GGML_STANDALONE)
|
||||||
|
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
|
||||||
|
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||||
|
@ONLY)
|
||||||
|
|
||||||
|
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
|
||||||
|
DESTINATION share/pkgconfig)
|
||||||
|
endif()
|
|
@ -79,22 +79,22 @@ endmacro()
|
||||||
# flags are for MSVC only!
|
# flags are for MSVC only!
|
||||||
check_sse("AVX" " ;/arch:AVX")
|
check_sse("AVX" " ;/arch:AVX")
|
||||||
if (NOT ${AVX_FOUND})
|
if (NOT ${AVX_FOUND})
|
||||||
set(LLAMA_AVX OFF)
|
set(GGML_AVX OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX ON)
|
set(GGML_AVX ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
check_sse("AVX2" " ;/arch:AVX2")
|
check_sse("AVX2" " ;/arch:AVX2")
|
||||||
check_sse("FMA" " ;/arch:AVX2")
|
check_sse("FMA" " ;/arch:AVX2")
|
||||||
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
|
||||||
set(LLAMA_AVX2 OFF)
|
set(GGML_AVX2 OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX2 ON)
|
set(GGML_AVX2 ON)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
check_sse("AVX512" " ;/arch:AVX512")
|
check_sse("AVX512" " ;/arch:AVX512")
|
||||||
if (NOT ${AVX512_FOUND})
|
if (NOT ${AVX512_FOUND})
|
||||||
set(LLAMA_AVX512 OFF)
|
set(GGML_AVX512 OFF)
|
||||||
else()
|
else()
|
||||||
set(LLAMA_AVX512 ON)
|
set(GGML_AVX512 ON)
|
||||||
endif()
|
endif()
|
|
@ -8,7 +8,9 @@
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "ggml-sycl/presets.hpp"
|
|
||||||
|
#define GGML_SYCL_NAME "SYCL"
|
||||||
|
#define GGML_SYCL_MAX_DEVICES 48
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
|
@ -312,6 +312,12 @@
|
||||||
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
|
||||||
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
|
||||||
|
|
||||||
|
#define GGML_TENSOR_BINARY_OP_LOCALS01 \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
|
||||||
|
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
|
||||||
|
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
@ -585,11 +591,7 @@ extern "C" {
|
||||||
struct ggml_tensor * grad;
|
struct ggml_tensor * grad;
|
||||||
struct ggml_tensor * src[GGML_MAX_SRC];
|
struct ggml_tensor * src[GGML_MAX_SRC];
|
||||||
|
|
||||||
// performance
|
// source tensor and offset for views
|
||||||
int perf_runs;
|
|
||||||
int64_t perf_cycles;
|
|
||||||
int64_t perf_time_us;
|
|
||||||
|
|
||||||
struct ggml_tensor * view_src;
|
struct ggml_tensor * view_src;
|
||||||
size_t view_offs;
|
size_t view_offs;
|
||||||
|
|
||||||
|
@ -599,7 +601,7 @@ extern "C" {
|
||||||
|
|
||||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||||
|
|
||||||
char padding[8];
|
// char padding[4];
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||||
|
@ -646,11 +648,6 @@ extern "C" {
|
||||||
struct ggml_hash_set visited_hash_table;
|
struct ggml_hash_set visited_hash_table;
|
||||||
|
|
||||||
enum ggml_cgraph_eval_order order;
|
enum ggml_cgraph_eval_order order;
|
||||||
|
|
||||||
// performance
|
|
||||||
int perf_runs;
|
|
||||||
int64_t perf_cycles;
|
|
||||||
int64_t perf_time_us;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// scratch buffer
|
// scratch buffer
|
||||||
|
@ -667,28 +664,6 @@ extern "C" {
|
||||||
bool no_alloc; // don't allocate memory for the tensor data
|
bool no_alloc; // don't allocate memory for the tensor data
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
// compute types
|
|
||||||
|
|
||||||
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
|
|
||||||
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
|
|
||||||
enum ggml_task_type {
|
|
||||||
GGML_TASK_TYPE_INIT = 0,
|
|
||||||
GGML_TASK_TYPE_COMPUTE,
|
|
||||||
GGML_TASK_TYPE_FINALIZE,
|
|
||||||
};
|
|
||||||
|
|
||||||
struct ggml_compute_params {
|
|
||||||
enum ggml_task_type type;
|
|
||||||
|
|
||||||
// ith = thread index, nth = number of threads
|
|
||||||
int ith, nth;
|
|
||||||
|
|
||||||
// work buffer for all threads
|
|
||||||
size_t wsize;
|
|
||||||
void * wdata;
|
|
||||||
};
|
|
||||||
|
|
||||||
// numa strategies
|
// numa strategies
|
||||||
enum ggml_numa_strategy {
|
enum ggml_numa_strategy {
|
||||||
GGML_NUMA_STRATEGY_DISABLED = 0,
|
GGML_NUMA_STRATEGY_DISABLED = 0,
|
1174
ggml/src/CMakeLists.txt
Normal file
1174
ggml/src/CMakeLists.txt
Normal file
File diff suppressed because it is too large
Load diff
|
@ -152,16 +152,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
|
||||||
|
|
||||||
int64_t total_vram = 0;
|
int64_t total_vram = 0;
|
||||||
#if defined(GGML_CUDA_FORCE_MMQ)
|
#ifdef GGML_CUDA_FORCE_MMQ
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
|
||||||
#else
|
#else
|
||||||
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
|
||||||
#endif
|
#endif // GGML_CUDA_FORCE_MMQ
|
||||||
#if defined(CUDA_USE_TENSOR_CORES)
|
#ifdef GGML_CUDA_FORCE_CUBLAS
|
||||||
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
|
||||||
#else
|
#else
|
||||||
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
|
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
|
||||||
#endif
|
#endif // GGML_CUDA_FORCE_CUBLAS
|
||||||
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
|
||||||
}
|
}
|
||||||
|
|
||||||
const int cc = ggml_cuda_info().devices[id].cc;
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
|
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
|
||||||
}
|
}
|
||||||
return row_rounding;
|
return row_rounding;
|
||||||
}
|
}
|
||||||
|
@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
||||||
|
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||||
|
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||||
|
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||||
|
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||||
|
|
||||||
|
bool any_gpus_with_slow_fp16 = false;
|
||||||
|
|
||||||
bool any_pascal_with_slow_fp16 = false;
|
|
||||||
if (split) {
|
if (split) {
|
||||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
||||||
auto & tensor_split = buft_ctx->tensor_split;
|
auto & tensor_split = buft_ctx->tensor_split;
|
||||||
|
@ -1885,55 +1893,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
|
const int cc = ggml_cuda_info().devices[id].cc;
|
||||||
min_compute_capability = ggml_cuda_info().devices[id].cc;
|
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
||||||
}
|
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||||
if (ggml_cuda_info().devices[id].cc == 610) {
|
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||||
any_pascal_with_slow_fp16 = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
|
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||||
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
|
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
|
||||||
|
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||||
|
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||||
}
|
}
|
||||||
|
|
||||||
// check data types and tensor shapes for custom matrix multiplication kernels:
|
|
||||||
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
|
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
||||||
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
|
|
||||||
|
|
||||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
|
||||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
|
||||||
|
|
||||||
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
|
|
||||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
||||||
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
|
|
||||||
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
|
||||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
|
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
|
||||||
|
|
||||||
#else
|
|
||||||
|
|
||||||
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
|
|
||||||
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
|
|
||||||
|
|
||||||
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
|
|
||||||
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
|
|
||||||
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
|
|
||||||
|
|
||||||
#ifdef CUDA_USE_TENSOR_CORES
|
|
||||||
// when tensor cores are available, use them for large batch size
|
|
||||||
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
|
|
||||||
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
|
|
||||||
#endif // CUDA_USE_TENSOR_CORES
|
|
||||||
|
|
||||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
||||||
|
|
||||||
// if mmvq is available it's a better choice than dmmv:
|
// if mmvq is available it's a better choice than dmmv:
|
||||||
#ifndef GGML_CUDA_FORCE_DMMV
|
#ifndef GGML_CUDA_FORCE_DMMV
|
||||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||||
|
@ -1947,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||||
|
|
||||||
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
|
||||||
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
|
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
|
||||||
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
|
||||||
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||||
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
|
||||||
// KQ + KQV multi-batch
|
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||||
|
// KQ + KQV multi-batch without FlashAttention
|
||||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||||
} else if (use_dequantize_mul_mat_vec) {
|
} else if (use_dequantize_mul_mat_vec) {
|
||||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
|
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue