diff --git a/.devops/cpu.Dockerfile b/.devops/cpu.Dockerfile new file mode 100644 index 000000000..8d020f16c --- /dev/null +++ b/.devops/cpu.Dockerfile @@ -0,0 +1,81 @@ +ARG UBUNTU_VERSION=22.04 + +FROM ubuntu:$UBUNTU_VERSION AS build + +RUN apt-get update && \ + apt-get install -y build-essential git cmake libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j $(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ubuntu:$UBUNTU_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/cuda.Dockerfile b/.devops/cuda.Dockerfile new file mode 100644 index 000000000..974dd78a8 --- /dev/null +++ b/.devops/cuda.Dockerfile @@ -0,0 +1,94 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG CUDA_VERSION=12.6.0 +# Target the CUDA build image +ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_CUDA_DEV_CONTAINER} AS build + +# CUDA architecture to build for (defaults to all supported archs) +ARG CUDA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 + +WORKDIR /app + +COPY . . + +RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_CUDA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile deleted file mode 100644 index 05bff1bdf..000000000 --- a/.devops/full-cuda.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) && \ - cp build/bin/* . - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-musa.Dockerfile b/.devops/full-musa.Dockerfile deleted file mode 100644 index 3193fea1e..000000000 --- a/.devops/full-musa.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release -j$(nproc) && \ - cp build/bin/* . - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile deleted file mode 100644 index df496bcd2..000000000 --- a/.devops/full-rocm.Dockerfile +++ /dev/null @@ -1,50 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ - -# Enable cURL -ENV LLAMA_CURL=1 -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev - -RUN make -j$(nproc) - -ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile deleted file mode 100644 index d93c0be6a..000000000 --- a/.devops/full.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION as runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev libgomp1 - -COPY requirements.txt /app/requirements.txt -COPY requirements /app/requirements -COPY .devops/tools.sh /app/tools.sh - -RUN pip install --upgrade pip setuptools wheel && \ - pip install -r /app/requirements.txt - -COPY --from=build /app/build/bin/ /app/ -COPY --from=build /app/lib/ /app/ -COPY --from=build /app/convert_hf_to_gguf.py /app/ -COPY --from=build /app/gguf-py /app/gguf-py - -ENV LC_ALL=C.utf8 - -ENTRYPOINT ["/app/tools.sh"] diff --git a/.devops/intel.Dockerfile b/.devops/intel.Dockerfile new file mode 100644 index 000000000..af783f5e9 --- /dev/null +++ b/.devops/intel.Dockerfile @@ -0,0 +1,91 @@ +ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 + +## Build Image + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build + +ARG GGML_SYCL_F16=OFF +RUN apt-get update && \ + apt-get install -y git libcurl4-openssl-dev + +WORKDIR /app + +COPY . . + +RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ + echo "GGML_SYCL_F16 is set" \ + && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ + fi && \ + echo "Building with dynamic libs" && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +### Full +FROM base AS full + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/lib/ /app +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] + diff --git a/.devops/llama-cli-cuda.Dockerfile b/.devops/llama-cli-cuda.Dockerfile deleted file mode 100644 index 7796891d5..000000000 --- a/.devops/llama-cli-cuda.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the CUDA runtime image -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libgomp1 - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-cli / - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-intel.Dockerfile b/.devops/llama-cli-intel.Dockerfile deleted file mode 100644 index 0706f732a..000000000 --- a/.devops/llama-cli-intel.Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build - -ARG GGML_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git - -WORKDIR /app - -COPY . . - -RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ - echo "GGML_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ - fi && \ - echo "Building with static libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \ - ${OPT_SYCL_F16} -DBUILD_SHARED_LIBS=OFF && \ - cmake --build build --config Release --target llama-cli - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime - -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-musa.Dockerfile b/.devops/llama-cli-musa.Dockerfile deleted file mode 100644 index e7c75af20..000000000 --- a/.devops/llama-cli-musa.Dockerfile +++ /dev/null @@ -1,38 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the MUSA runtime image -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-cli -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libgomp1 - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-cli /llama-cli - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli-rocm.Dockerfile b/.devops/llama-cli-rocm.Dockerfile deleted file mode 100644 index e60c747bd..000000000 --- a/.devops/llama-cli-rocm.Dockerfile +++ /dev/null @@ -1,45 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ - -RUN make -j$(nproc) llama-cli - -ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-cli-vulkan.Dockerfile b/.devops/llama-cli-vulkan.Dockerfile deleted file mode 100644 index 92a6e0479..000000000 --- a/.devops/llama-cli-vulkan.Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -ARG UBUNTU_VERSION=jammy - -FROM ubuntu:$UBUNTU_VERSION AS build - -# Install build tools -RUN apt update && apt install -y git build-essential cmake wget libgomp1 - -# Install Vulkan SDK -RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ - apt update -y && \ - apt-get install -y vulkan-sdk - -# Build it -WORKDIR /app -COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 && \ - cmake --build build --config Release --target llama-cli - -# Clean up -WORKDIR / -RUN cp /app/build/bin/llama-cli /llama-cli && \ - rm -rf /app - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/llama-cli" ] diff --git a/.devops/llama-cli.Dockerfile b/.devops/llama-cli.Dockerfile deleted file mode 100644 index be234d55d..000000000 --- a/.devops/llama-cli.Dockerfile +++ /dev/null @@ -1,29 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION AS runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/build/bin/llama-cli /app/ -COPY --from=build /app/lib/ /app/ - -ENV LC_ALL=C.utf8 - -ENTRYPOINT [ "/app/llama-cli" ] diff --git a/.devops/llama-server-cuda.Dockerfile b/.devops/llama-server-cuda.Dockerfile deleted file mode 100644 index bf8a198f9..000000000 --- a/.devops/llama-server-cuda.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG CUDA_VERSION=12.6.0 -# Target the CUDA build image -ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the CUDA runtime image -ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_CUDA_DEV_CONTAINER} AS build - -# CUDA architecture to build for (defaults to all supported archs) -ARG CUDA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -# Use the default CUDA archs if not specified -RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-server /llama-server - -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-intel.Dockerfile b/.devops/llama-server-intel.Dockerfile deleted file mode 100644 index b503b8cfe..000000000 --- a/.devops/llama-server-intel.Dockerfile +++ /dev/null @@ -1,34 +0,0 @@ -ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04 - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build - -ARG GGML_SYCL_F16=OFF -RUN apt-get update && \ - apt-get install -y git libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \ - echo "GGML_SYCL_F16 is set" && \ - export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \ - fi && \ - echo "Building with dynamic libs" && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ - cmake --build build --config Release --target llama-server - -FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev curl - -COPY --from=build /app/build/bin/llama-server /llama-server - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-musa.Dockerfile b/.devops/llama-server-musa.Dockerfile deleted file mode 100644 index cebe51d42..000000000 --- a/.devops/llama-server-musa.Dockerfile +++ /dev/null @@ -1,43 +0,0 @@ -ARG UBUNTU_VERSION=22.04 -# This needs to generally match the container host's environment. -ARG MUSA_VERSION=rc3.1.0 -# Target the MUSA build image -ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} -# Target the MUSA runtime image -ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} - -FROM ${BASE_MUSA_DEV_CONTAINER} AS build - -# MUSA architecture to build for (defaults to all supported archs) -ARG MUSA_DOCKER_ARCH=default - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -# Use the default MUSA archs if not specified -RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ - export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ - fi && \ - cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ - cmake --build build --config Release --target llama-server -j$(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib \; - -FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/lib/ / -COPY --from=build /app/build/bin/llama-server /llama-server - -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server-rocm.Dockerfile b/.devops/llama-server-rocm.Dockerfile deleted file mode 100644 index 8553af75b..000000000 --- a/.devops/llama-server-rocm.Dockerfile +++ /dev/null @@ -1,54 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -# This needs to generally match the container host's environment. -ARG ROCM_VERSION=5.6 - -# Target the CUDA build image -ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete - -FROM ${BASE_ROCM_DEV_CONTAINER} AS build - -# Unless otherwise specified, we make a fat build. -# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 -# This is mostly tied to rocBLAS supported archs. -ARG ROCM_DOCKER_ARCH="\ - gfx803 \ - gfx900 \ - gfx906 \ - gfx908 \ - gfx90a \ - gfx1010 \ - gfx1030 \ - gfx1100 \ - gfx1101 \ - gfx1102" - -COPY requirements.txt requirements.txt -COPY requirements requirements - -RUN pip install --upgrade pip setuptools wheel \ - && pip install -r requirements.txt - -WORKDIR /app - -COPY . . - -# Set nvcc architecture -ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} -# Enable ROCm -ENV GGML_HIPBLAS=1 -ENV CC=/opt/rocm/llvm/bin/clang -ENV CXX=/opt/rocm/llvm/bin/clang++ -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -# Enable cURL -ENV LLAMA_CURL=1 -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev curl - -RUN make -j$(nproc) llama-server - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/llama-server-vulkan.Dockerfile b/.devops/llama-server-vulkan.Dockerfile deleted file mode 100644 index 6aa786779..000000000 --- a/.devops/llama-server-vulkan.Dockerfile +++ /dev/null @@ -1,31 +0,0 @@ -ARG UBUNTU_VERSION=jammy - -FROM ubuntu:$UBUNTU_VERSION AS build - -# Install build tools -RUN apt update && apt install -y git build-essential cmake wget - -# Install Vulkan SDK and cURL -RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ - wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ - apt update -y && \ - apt-get install -y vulkan-sdk libcurl4-openssl-dev curl - -# Build it -WORKDIR /app -COPY . . -RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ - cmake --build build --config Release --target llama-server - -# Clean up -WORKDIR / -RUN cp /app/build/bin/llama-server /llama-server && \ - rm -rf /app - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/llama-server" ] diff --git a/.devops/llama-server.Dockerfile b/.devops/llama-server.Dockerfile deleted file mode 100644 index 72ccde2fe..000000000 --- a/.devops/llama-server.Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -ARG UBUNTU_VERSION=22.04 - -FROM ubuntu:$UBUNTU_VERSION AS build - -RUN apt-get update && \ - apt-get install -y build-essential git cmake libcurl4-openssl-dev - -WORKDIR /app - -COPY . . - -RUN cmake -S . -B build -DGGML_BACKEND_DL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ALL_VARIANTS=ON -DLLAMA_CURL=ON -DCMAKE_BUILD_TYPE=Release && \ - cmake --build build -j $(nproc) && \ - mkdir -p /app/lib && \ - find build -name "*.so" -exec cp {} /app/lib/ \; - -FROM ubuntu:$UBUNTU_VERSION AS runtime - -WORKDIR /app - -RUN apt-get update && \ - apt-get install -y libcurl4-openssl-dev libgomp1 curl - -COPY --from=build /app/build/bin/llama-server /app/ -COPY --from=build /app/lib/ /app/ - -ENV LC_ALL=C.utf8 -# Must be set to 0.0.0.0 so it can listen to requests from host machine -ENV LLAMA_ARG_HOST=0.0.0.0 - -HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] - -ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/musa.Dockerfile b/.devops/musa.Dockerfile new file mode 100644 index 000000000..bfd7fc1c1 --- /dev/null +++ b/.devops/musa.Dockerfile @@ -0,0 +1,108 @@ +ARG UBUNTU_VERSION=22.04 +# This needs to generally match the container host's environment. +ARG MUSA_VERSION=rc3.1.0 +# Target the MUSA build image +ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION} + +ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION} + +FROM ${BASE_MUSA_DEV_CONTAINER} AS build + +# MUSA architecture to build for (defaults to all supported archs) +ARG MUSA_DOCKER_ARCH=default + +RUN apt-get update && \ + apt-get install -y \ + build-essential \ + cmake \ + python3 \ + python3-pip \ + git \ + libcurl4-openssl-dev \ + libgomp1 + +COPY requirements.txt requirements.txt +COPY requirements requirements + +RUN pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt + +WORKDIR /app + +COPY . . + +# Use the default MUSA archs if not specified +RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \ + export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \ + fi && \ + cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_MUSA_RUN_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index b88e6ca80..043c4364b 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -31,6 +31,7 @@ # Increases the runtime closure size by ~700M useMpi ? false, useRocm ? config.rocmSupport, + rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets, enableCurl ? true, useVulkan ? false, llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake @@ -188,7 +189,7 @@ effectiveStdenv.mkDerivation (finalAttrs: { ] ++ optionals useRocm [ (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang") - (cmakeFeature "CMAKE_HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets)) + (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets) ] ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") diff --git a/.devops/rocm.Dockerfile b/.devops/rocm.Dockerfile new file mode 100644 index 000000000..a8088ea00 --- /dev/null +++ b/.devops/rocm.Dockerfile @@ -0,0 +1,113 @@ +ARG UBUNTU_VERSION=24.04 + +# This needs to generally match the container host's environment. +ARG ROCM_VERSION=6.3 +ARG AMDGPU_VERSION=6.3 + +# Target the CUDA build image +ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete + +### Build image +FROM ${BASE_ROCM_DEV_CONTAINER} AS build + +# Unless otherwise specified, we make a fat build. +# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878 +# This is mostly tied to rocBLAS supported archs. +# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported +# gfx906 is deprecated +#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html + +#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102' +ARG ROCM_DOCKER_ARCH=gfx1100 + +# Set nvcc architectured +ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH} +# Enable ROCm +# ENV CC=/opt/rocm/llvm/bin/clang +# ENV CXX=/opt/rocm/llvm/bin/clang++ + +RUN apt-get update \ + && apt-get install -y \ + build-essential \ + cmake \ + git \ + libcurl4-openssl-dev \ + curl \ + libgomp1 + +WORKDIR /app + +COPY . . + +RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \ + cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \ + && cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib \ + && find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ${BASE_ROCM_DEV_CONTAINER} AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3-pip \ + python3 \ + python3-wheel\ + && pip install --break-system-packages --upgrade setuptools \ + && pip install --break-system-packages -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile new file mode 100644 index 000000000..cfc2162e3 --- /dev/null +++ b/.devops/vulkan.Dockerfile @@ -0,0 +1,88 @@ +ARG UBUNTU_VERSION=jammy + +FROM ubuntu:$UBUNTU_VERSION AS build + +# Install build tools +RUN apt update && apt install -y git build-essential cmake wget + +# Install Vulkan SDK and cURL +RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \ + wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \ + apt update -y && \ + apt-get install -y vulkan-sdk libcurl4-openssl-dev curl + +# Build it +WORKDIR /app + +COPY . . + +RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \ + cmake --build build --config Release -j$(nproc) + +RUN mkdir -p /app/lib && \ + find build -name "*.so" -exec cp {} /app/lib \; + +RUN mkdir -p /app/full \ + && cp build/bin/* /app/full \ + && cp *.py /app/full \ + && cp -r gguf-py /app/full \ + && cp -r requirements /app/full \ + && cp requirements.txt /app/full \ + && cp .devops/tools.sh /app/full/tools.sh + +## Base image +FROM ubuntu:$UBUNTU_VERSION AS base + +RUN apt-get update \ + && apt-get install -y libgomp1 curl\ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +COPY --from=build /app/lib/ /app + +### Full +FROM base AS full + +COPY --from=build /app/full /app + +WORKDIR /app + +RUN apt-get update \ + && apt-get install -y \ + git \ + python3 \ + python3-pip \ + && pip install --upgrade pip setuptools wheel \ + && pip install -r requirements.txt \ + && apt autoremove -y \ + && apt clean -y \ + && rm -rf /tmp/* /var/tmp/* \ + && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \ + && find /var/cache -type f -delete + +ENTRYPOINT ["/app/tools.sh"] + +### Light, CLI only +FROM base AS light + +COPY --from=build /app/full/llama-cli /app + +WORKDIR /app + +ENTRYPOINT [ "/app/llama-cli" ] + +### Server, Server only +FROM base AS server + +ENV LLAMA_ARG_HOST=0.0.0.0 + +COPY --from=build /app/full/llama-server /app + +WORKDIR /app + +HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ] + +ENTRYPOINT [ "/app/llama-server" ] diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 022d31fb2..a377eff38 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -317,7 +317,7 @@ jobs: wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo apt-key add - sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list sudo apt-get update -y - sudo apt-get install -y build-essential vulkan-sdk + sudo apt-get install -y build-essential mesa-vulkan-drivers vulkan-sdk - name: Build id: cmake_build @@ -327,6 +327,12 @@ jobs: cmake -DGGML_VULKAN=ON .. cmake --build . --config Release -j $(nproc) + - name: Test + id: cmake_test + run: | + cd build + ctest -L main --verbose --timeout 900 + ubuntu-22-cmake-hip: runs-on: ubuntu-22.04 container: rocm/dev-ubuntu-22.04:6.0.2 diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index bc2e5020d..41f1a89ee 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -34,21 +34,14 @@ jobs: strategy: matrix: config: - - { tag: "light", dockerfile: ".devops/llama-cli.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "server", dockerfile: ".devops/llama-server.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-cuda", dockerfile: ".devops/llama-cli-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-cuda", dockerfile: ".devops/llama-server-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" } - - { tag: "light-musa", dockerfile: ".devops/llama-cli-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-musa", dockerfile: ".devops/llama-server-musa.Dockerfile", platforms: "linux/amd64" } - - { tag: "full-musa", dockerfile: ".devops/full-musa.Dockerfile", platforms: "linux/amd64" } + # Multi-stage build + - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false} + - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} + - { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false} # Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete - #- { tag: "light-rocm", dockerfile: ".devops/llama-cli-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "server-rocm", dockerfile: ".devops/llama-server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - #- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" } - - { tag: "light-intel", dockerfile: ".devops/llama-cli-intel.Dockerfile", platforms: "linux/amd64" } - - { tag: "server-intel", dockerfile: ".devops/llama-server-intel.Dockerfile", platforms: "linux/amd64" } + #- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true } steps: - name: Check out the repo uses: actions/checkout@v4 @@ -56,10 +49,10 @@ jobs: fetch-depth: 0 # preserve git history, so we can determine the build number - name: Set up QEMU - uses: docker/setup-qemu-action@v2 + uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Log in to Docker Hub uses: docker/login-action@v2 @@ -79,25 +72,34 @@ jobs: # determine tag name postfix (build number, commit hash) if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then - TAG_POSTFIX="b${BUILD_NUMBER}" + TAG_POSTFIX="-b${BUILD_NUMBER}" else SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-') - TAG_POSTFIX="${SAFE_NAME}-${SHORT_HASH}" + TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}" fi - # list all tags possible - TAGS="" - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}," - TAGS="${TAGS}ghcr.io/${REPO_OWNER}/${REPO_NAME}:${{ matrix.config.tag }}-${TAG_POSTFIX}" - - echo "output_tags=$TAGS" >> $GITHUB_OUTPUT - echo "output_tags=$TAGS" # print out for debugging + if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then + TYPE="" + else + TYPE="-${{ matrix.config.tag }}" + fi + PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:" + FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}" + LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}" + SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}" + echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT + echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT + echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT + echo "full_output_tags=$FULLTAGS" # print out for debugging + echo "light_output_tags=$LIGHTTAGS" # print out for debugging + echo "server_output_tags=$SERVERTAGS" # print out for debugging env: GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' # https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example - name: Free Disk Space (Ubuntu) + if: ${{ matrix.config.free_disk_space == true }} uses: jlumbroso/free-disk-space@main with: # this might remove tools that are actually needed, @@ -113,13 +115,59 @@ jobs: docker-images: true swap-storage: true - - name: Build and push Docker image (tagged + versioned) - if: ${{ github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + - name: Build and push Full Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }} uses: docker/build-push-action@v6 with: context: . push: true platforms: ${{ matrix.config.platforms }} # tag list is generated from step above - tags: ${{ steps.tag.outputs.output_tags }} + tags: ${{ steps.tag.outputs.full_output_tags }} file: ${{ matrix.config.dockerfile }} + target: full + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + + - name: Build and push Light Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }} + uses: docker/build-push-action@v6 + with: + context: . + push: true + platforms: ${{ matrix.config.platforms }} + # tag list is generated from step above + tags: ${{ steps.tag.outputs.light_output_tags }} + file: ${{ matrix.config.dockerfile }} + target: light + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache + + - name: Build and push Server Docker image (tagged + versioned) + if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }} + uses: docker/build-push-action@v6 + with: + context: . + push: true + platforms: ${{ matrix.config.platforms }} + # tag list is generated from step above + tags: ${{ steps.tag.outputs.server_output_tags }} + file: ${{ matrix.config.dockerfile }} + target: server + provenance: false + # using github experimental cache + cache-from: type=gha + cache-to: type=gha,mode=max + # return to this if the experimental github cache is having issues + #cache-to: type=local,dest=/tmp/.buildx-cache + #cache-from: type=local,src=/tmp/.buildx-cache diff --git a/README.md b/README.md index 49095acc8..d6d1958c8 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo - [x] [Jais](https://huggingface.co/inceptionai/jais-13b-chat) - [x] [Bielik-11B-v2.3](https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a) - [x] [RWKV-6](https://github.com/BlinkDL/RWKV-LM) +- [x] [GigaChat-20B-A3B](https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct) #### Multimodal @@ -220,7 +221,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo | [SYCL](docs/backend/SYCL.md) | Intel and Nvidia GPU | | [MUSA](docs/build.md#musa) | Moore Threads MTT GPU | | [CUDA](docs/build.md#cuda) | Nvidia GPU | -| [hipBLAS](docs/build.md#hipblas) | AMD GPU | +| [HIP](docs/build.md#hip) | AMD GPU | | [Vulkan](docs/build.md#vulkan) | GPU | | [CANN](docs/build.md#cann) | Ascend NPU | @@ -413,7 +414,7 @@ To learn more about model quantization, [read this documentation](examples/quant [^1]: [examples/perplexity/README.md](examples/perplexity/README.md) [^2]: [https://huggingface.co/docs/transformers/perplexity](https://huggingface.co/docs/transformers/perplexity) -## [`llama-bench`](example/bench) +## [`llama-bench`](examples/llama-bench) #### Benchmark the performance of the inference for various parameters. @@ -447,7 +448,7 @@ To learn more about model quantization, [read this documentation](examples/quant -[^3]: [https://github.com/containers/ramalama](RamaLama) +[^3]: [RamaLama](https://github.com/containers/ramalama) ## [`llama-simple`](examples/simple) diff --git a/common/arg.cpp b/common/arg.cpp index 416a468f8..edcda60e0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -119,29 +119,33 @@ std::string common_arg::to_string() { // utils // -static void common_params_handle_model_default(common_params & params) { - if (!params.hf_repo.empty()) { +static void common_params_handle_model_default( + std::string & model, + std::string & model_url, + std::string & hf_repo, + std::string & hf_file) { + if (!hf_repo.empty()) { // short-hand to avoid specifying --hf-file -> default it to --model - if (params.hf_file.empty()) { - if (params.model.empty()) { + if (hf_file.empty()) { + if (model.empty()) { throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); } - params.hf_file = params.model; - } else if (params.model.empty()) { + hf_file = model; + } else if (model.empty()) { // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = params.hf_repo + "_" + params.hf_file; + std::string filename = hf_repo + "_" + hf_file; // to make sure we don't have any slashes in the filename string_replace_all(filename, "/", "_"); - params.model = fs_get_cache_file(filename); + model = fs_get_cache_file(filename); } - } else if (!params.model_url.empty()) { - if (params.model.empty()) { - auto f = string_split(params.model_url, '#').front(); + } else if (!model_url.empty()) { + if (model.empty()) { + auto f = string_split(model_url, '#').front(); f = string_split(f, '?').front(); - params.model = fs_get_cache_file(string_split(f, '/').back()); + model = fs_get_cache_file(string_split(f, '/').back()); } - } else if (params.model.empty()) { - params.model = DEFAULT_MODEL_PATH; + } else if (model.empty()) { + model = DEFAULT_MODEL_PATH; } } @@ -276,7 +280,9 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model_default(params); + // TODO: refactor model params in a common struct + common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file); + common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file); if (params.escape) { string_process_escapes(params.prompt); @@ -620,7 +626,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params) { params.ctx_shift = false; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT")); add_opt(common_arg( {"--chunks"}, "N", string_format("max number of chunks to process (default: %d, -1 = all)", params.n_chunks), @@ -842,7 +848,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_sparam()); add_opt(common_arg( - {"--sampling-seq"}, "SEQUENCE", + {"--sampling-seq", "--sampler-seq"}, "SEQUENCE", string_format("simplified sequence for samplers that will be used (default: %s)", sampler_type_chars.c_str()), [](common_params & params, const std::string & value) { params.sampling.samplers = common_sampler_types_from_chars(value); @@ -855,13 +861,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.sampling.ignore_eos = true; } ).set_sparam()); - add_opt(common_arg( - {"--penalize-nl"}, - string_format("penalize newline tokens (default: %s)", params.sampling.penalize_nl ? "true" : "false"), - [](common_params & params) { - params.sampling.penalize_nl = true; - } - ).set_sparam()); add_opt(common_arg( {"--temp"}, "N", string_format("temperature (default: %.1f)", (double)params.sampling.temp), @@ -916,6 +915,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--repeat-last-n"}, "N", string_format("last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)", params.sampling.penalty_last_n), [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid repeat-last-n = %d\n", value)); + } params.sampling.penalty_last_n = value; params.sampling.n_prev = std::max(params.sampling.n_prev, params.sampling.penalty_last_n); } @@ -970,6 +972,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--dry-penalty-last-n"}, "N", string_format("set DRY penalty for the last n tokens (default: %d, 0 = disable, -1 = context size)", params.sampling.dry_penalty_last_n), [](common_params & params, int value) { + if (value < -1) { + throw std::runtime_error(string_format("error: invalid dry-penalty-last-n = %d\n", value)); + } params.sampling.dry_penalty_last_n = value; } ).set_sparam()); @@ -1582,6 +1587,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); + add_opt(common_arg( + {"-hfrv", "--hf-repo-v"}, "REPO", + "Hugging Face model repository for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_repo = value; + } + ).set_env("LLAMA_ARG_HF_REPO_V")); + add_opt(common_arg( + {"-hffv", "--hf-file-v"}, "FILE", + "Hugging Face model file for the vocoder model (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.hf_file = value; + } + ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( {"-hft", "--hf-token"}, "TOKEN", "Hugging Face access token (default: value from HF_TOKEN environment variable)", @@ -2214,5 +2233,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); + add_opt(common_arg( + {"-mv", "--model-vocoder"}, "FNAME", + "vocoder model for audio generation (default: unused)", + [](common_params & params, const std::string & value) { + params.vocoder.model = value; + } + ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); + + // model-specific + add_opt(common_arg( + {"--tts-oute-default"}, + string_format("use default OuteTTS models (note: can download weights from the internet)"), + [](common_params & params) { + params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + params.vocoder.hf_repo = "ggml-org/WavTokenizer"; + params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + } + ).set_examples({LLAMA_EXAMPLE_TTS})); + return ctx_arg; } diff --git a/common/common.cpp b/common/common.cpp index 8491ad9a4..1fd91f00b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -979,6 +979,25 @@ struct common_init_result common_init_from_params(common_params & params) { params.sampling.ignore_eos = false; } + if (params.sampling.ignore_eos) { + for (llama_token i = 0; i < llama_n_vocab(model); i++) { + if (llama_token_is_eog(model, i)) { + LOG_INF("%s: added %s logit bias = %f\n", __func__, common_token_to_piece(lctx, i).c_str(), -INFINITY); + params.sampling.logit_bias.push_back({i, -INFINITY}); + } + } + } + + if (params.sampling.penalty_last_n == -1) { + LOG_INF("%s: setting penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + params.sampling.penalty_last_n = llama_n_ctx(lctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + LOG_INF("%s: setting dry_penalty_last_n to ctx_size = %d\n", __func__, llama_n_ctx(lctx)); + params.sampling.dry_penalty_last_n = llama_n_ctx(lctx); + } + if (params.warmup) { LOG_WRN("%s: warming up the model with an empty run - please wait ... (--no-warmup to disable)\n", __func__); @@ -1115,7 +1134,7 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p #define CURL_MAX_RETRY 3 #define CURL_RETRY_DELAY_SECONDS 2 -static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_attempts, int retry_delay_seconds) { +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { int remaining_attempts = max_attempts; while (remaining_attempts > 0) { @@ -1139,7 +1158,6 @@ static bool curl_perform_with_retry(const std::string& url, CURL* curl, int max_ } static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { @@ -1212,11 +1230,13 @@ static bool common_download_file(const std::string & url, const std::string & pa std::string etag; std::string last_modified; }; + common_load_model_from_url_headers headers; + { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers *headers = (common_load_model_from_url_headers *) userdata; + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; static std::regex header_regex("([^:]+): (.*)\r\n"); static std::regex etag_regex("ETag", std::regex_constants::icase); @@ -1858,7 +1878,9 @@ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm) break; case 0: // max absolute for (int i = 0; i < n; i++) { - if (sum < std::abs(inp[i])) sum = std::abs(inp[i]); + if (sum < std::abs(inp[i])) { + sum = std::abs(inp[i]); + } } sum /= 32760.0; // make an int16 range break; diff --git a/common/common.h b/common/common.h index 693561569..b5514084e 100644 --- a/common/common.h +++ b/common/common.h @@ -86,6 +86,7 @@ enum llama_example { LLAMA_EXAMPLE_LLAVA, LLAMA_EXAMPLE_LOOKUP, LLAMA_EXAMPLE_PARALLEL, + LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_COUNT, }; @@ -101,6 +102,7 @@ enum common_sampler_type { COMMON_SAMPLER_TYPE_TEMPERATURE = 7, COMMON_SAMPLER_TYPE_XTC = 8, COMMON_SAMPLER_TYPE_INFILL = 9, + COMMON_SAMPLER_TYPE_PENALTIES = 10, }; // dimensionality reduction methods, used by cvector-generator @@ -136,7 +138,6 @@ struct common_params_sampling { int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token bool ignore_eos = false; bool no_perf = false; // disable performance metrics bool timing_per_token = false; @@ -145,6 +146,7 @@ struct common_params_sampling { std::vector samplers = { + COMMON_SAMPLER_TYPE_PENALTIES, COMMON_SAMPLER_TYPE_DRY, COMMON_SAMPLER_TYPE_TOP_K, COMMON_SAMPLER_TYPE_TYPICAL_P, @@ -165,6 +167,7 @@ struct common_params_sampling { struct common_params_speculative { std::vector devices; // devices to use for offloading + int32_t n_ctx = 0; // draft context size int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding @@ -178,6 +181,14 @@ struct common_params_speculative { std::string model = ""; // draft model for speculative decoding // NOLINT }; +struct common_params_vocoder { + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT + + std::string model = ""; // model path // NOLINT + std::string model_url = ""; // model url to download // NOLINT +}; + struct common_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 4096; // context size @@ -200,11 +211,13 @@ struct common_params { float defrag_thold = 0.1f; // KV cache defragmentation threshold // offload params - std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs + std::vector devices; // devices to use for offloading + + int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + + enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs struct cpu_params cpuparams; struct cpu_params cpuparams_batch; @@ -218,8 +231,9 @@ struct common_params { enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings - struct common_params_sampling sampling; + struct common_params_sampling sampling; struct common_params_speculative speculative; + struct common_params_vocoder vocoder; std::string model = ""; // model path // NOLINT std::string model_alias = ""; // model alias // NOLINT @@ -608,7 +622,8 @@ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_si // Embedding utils // -void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2); +// TODO: repace embd_norm with an enum +void common_embd_normalize(const float * inp, float * out, int n, int embd_norm); float common_embd_similarity_cos(const float * embd1, const float * embd2, int n); diff --git a/common/sampling.cpp b/common/sampling.cpp index cd938f0b9..e86120f5f 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -170,32 +170,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co params.logit_bias.size(), params.logit_bias.data())); - llama_sampler_chain_add(result->chain, - llama_sampler_init_penalties( - llama_n_vocab (model), - llama_token_eos(model), - llama_token_nl (model), - params.penalty_last_n, - params.penalty_repeat, - params.penalty_freq, - params.penalty_present, - params.penalize_nl, - params.ignore_eos)); - if (params.mirostat == 0) { for (const auto & cnstr : params.samplers) { switch (cnstr) { - case COMMON_SAMPLER_TYPE_DRY: + case COMMON_SAMPLER_TYPE_DRY: { - std::vector c_breakers; + std::vector c_breakers; c_breakers.reserve(params.dry_sequence_breakers.size()); - for (const auto& str : params.dry_sequence_breakers) { + for (const auto & str : params.dry_sequence_breakers) { c_breakers.push_back(str.c_str()); } llama_sampler_chain_add(result->chain, llama_sampler_init_dry (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size())); } - break; + break; case COMMON_SAMPLER_TYPE_TOP_K: llama_sampler_chain_add(result->chain, llama_sampler_init_top_k (params.top_k)); break; @@ -217,6 +205,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co case COMMON_SAMPLER_TYPE_INFILL: llama_sampler_chain_add(result->chain, llama_sampler_init_infill (model)); break; + case COMMON_SAMPLER_TYPE_PENALTIES: + llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present)); + break; default: GGML_ASSERT(false && "unknown sampler type"); } @@ -428,6 +419,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't'; case COMMON_SAMPLER_TYPE_XTC: return 'x'; case COMMON_SAMPLER_TYPE_INFILL: return 'i'; + case COMMON_SAMPLER_TYPE_PENALTIES: return 'e'; default : return '?'; } } @@ -442,6 +434,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) { case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature"; case COMMON_SAMPLER_TYPE_XTC: return "xtc"; case COMMON_SAMPLER_TYPE_INFILL: return "infill"; + case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties"; default : return ""; } } @@ -456,6 +449,7 @@ std::vector common_sampler_types_from_names(const std::vect { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE }, { "xtc", COMMON_SAMPLER_TYPE_XTC }, { "infill", COMMON_SAMPLER_TYPE_INFILL }, + { "penalties", COMMON_SAMPLER_TYPE_PENALTIES }, }; // since samplers names are written multiple ways @@ -502,6 +496,7 @@ std::vector common_sampler_types_from_chars(const std::stri { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC }, { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL }, + { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES }, }; std::vector samplers; diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 206ab502e..b6c15da94 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -221,17 +221,17 @@ class Model: self.gguf_writer.add_context_length(n_ctx) logger.info(f"gguf: context length = {n_ctx}") - n_embd = self.find_hparam(["hidden_size", "n_embd"]) - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) logger.info(f"gguf: feed forward length = {n_ff}") - n_head = self.find_hparam(["num_attention_heads", "n_head"]) - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) @@ -296,7 +296,9 @@ class Model: break for new_name, data_torch in (self.modify_tensors(data_torch, name, bid)): - data = data_torch.squeeze().numpy() + # TODO: why do we squeeze here? + # data = data_torch.squeeze().numpy() + data = data_torch.numpy() # if data ends up empty, it means data_torch was a scalar tensor -> restore if len(data.shape) == 0: @@ -324,6 +326,8 @@ class Model: gguf.MODEL_TENSOR.TIME_MIX_W2, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W1, gguf.MODEL_TENSOR.TIME_MIX_DECAY_W2, + gguf.MODEL_TENSOR.POSNET_NORM1, + gguf.MODEL_TENSOR.POSNET_NORM2, ) ) or not new_name.endswith(".weight") @@ -525,9 +529,19 @@ class Model: else: token: str = reverse_vocab[i] if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. + # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not tokenizer.added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer") + if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token): toktypes.append(gguf.TokenType.CONTROL) else: + # NOTE: this was added for Gemma. + # Encoding and decoding the tokens above isn't sufficient for this case. token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces toktypes.append(gguf.TokenType.USER_DEFINED) else: @@ -571,6 +585,9 @@ class Model: if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": # ref: https://huggingface.co/tiiuae/falcon-7b res = "falcon" + if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e": + # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base + res = "falcon3" if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 res = "bert-bge" @@ -664,6 +681,12 @@ class Model: if chkhsh == "8b5a93ed704057481f240da0be7e7dca721d7f8f4755263b6807227a2cbeae65": # ref: https://huggingface.co/sentence-transformers/stsb-roberta-base res = "roberta-bpe" + if chkhsh == "ad851be1dba641f2e3711822f816db2c265f788b37c63b4e1aeacb9ee92de8eb": + # ref: https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct + res = "gigachat" + if chkhsh == "d4c8f286ea6b520b3d495c4455483cfa2302c0cfcd4be05d781b6a8a0a7cdaf1": + # ref: https://huggingface.co/Infinigence/Megrez-3B-Instruct + res = "megrez" if res is None: logger.warning("\n") @@ -686,6 +709,9 @@ class Model: return res # Marker: End get_vocab_base_pre + def _set_vocab_none(self) -> None: + self.gguf_writer.add_tokenizer_model("none") + def _set_vocab_gpt2(self) -> None: tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") @@ -1669,6 +1695,184 @@ class LlamaModel(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeciLMForCausalLM") +class DeciModel(Model): + model_arch = gguf.MODEL_ARCH.DECI + + @staticmethod + def _ffn_mult_to_intermediate_size(ffn_mult: float, n_embd: int) -> int: + # DeciLM-specific code + intermediate_size = int(2 * ffn_mult * n_embd / 3) + return DeciModel._find_multiple(intermediate_size, 256) + + @staticmethod + def _find_multiple(n: int, k: int) -> int: + # DeciLM-specific code + if n % k == 0: + return n + return n + k - (n % k) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + _block_configs: list[dict[str,Any]] = self.hparams["block_configs"] + assert self.block_count == len(_block_configs) + self._num_kv_heads = list() + self._num_heads = list() + _ffn_multipliers = list() + # ***linear attention layer*** + # if n_heads_in_group is None and replace_with_linear is True + # then _num_kv_heads[il] is 0 and _num_heads[il] is num_attention_heads + # ***attention-free layer*** + # if n_heads_in_group is None and replace_with_linear is False + # then _num_kv_heads[il] is 0 and _num_heads[il] is 0 + # ***normal attention-layer*** + # if n_heads_in_group is not None, then + # _num_kv_heads[il] is num_attention_head // n_heads_in_group and + # _num_heads[il] is num_attention_head + for il in range(len(_block_configs)): + if _block_configs[il]["attention"]["n_heads_in_group"] is None: + if _block_configs[il]["attention"]["replace_with_linear"] is True: + self._num_kv_heads.append(0) + self._num_heads.append(self.hparams["num_attention_heads"]) + else: + self._num_kv_heads.append(0) + self._num_heads.append(0) + else: + self._num_kv_heads.append(self.hparams["num_attention_heads"] // _block_configs[il]["attention"]["n_heads_in_group"]) + self._num_heads.append(self.hparams["num_attention_heads"]) + _ffn_multipliers.append(_block_configs[il]["ffn"]["ffn_mult"]) + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(_ffn_multipliers) + assert isinstance(self._num_kv_heads, list) and isinstance(self._num_kv_heads[0], int) + assert isinstance(self._num_heads, list) and isinstance(self._num_heads[0], int) + assert isinstance(_ffn_multipliers, list) and isinstance(_ffn_multipliers[0], float) + self._ffn_dims: list[int] = [ + DeciModel._ffn_mult_to_intermediate_size(multiplier, self.hparams["hidden_size"]) + for multiplier in _ffn_multipliers + ] + + def set_vocab(self): + # Please change tokenizer_config.json of Llama-3_1-Nemotron-51B's + # eos_token from '|eot_id|' to '|end_of_text|' + if self.hparams.get("vocab_size", 128256) == 128256: + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=True, + special_token_types = ['bos', 'eos', 'eom', 'eot'] + ) + special_vocab._set_special_token("bos", 128000) + special_vocab._set_special_token("eos", 128001) + special_vocab._set_special_token("eom", 128008) + special_vocab._set_special_token("eot", 128009) + special_vocab.add_to_gguf(self.gguf_writer) + else: + # DeciLM-7B + self._set_vocab_llama_hf() +# self._set_vocab_gpt2() + + def set_gguf_parameters(self): + if "block_configs" in self.hparams: # Llama-3_1-Nemotron-51B + assert self.block_count == len(self._num_kv_heads) + assert self.block_count == len(self._num_heads) + assert self.block_count == len(self._ffn_dims) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + self.gguf_writer.add_head_count(self._num_heads) + self.gguf_writer.add_feed_forward_length(self._ffn_dims) + self.gguf_writer.add_block_count(self.block_count) + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + self.gguf_writer.add_key_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_value_length(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_file_type(self.ftype) + else: # DeciLM-7B + super().set_gguf_parameters() + if "num_key_value_heads_per_layer" in self.hparams: # DeciLM-7B + self._num_kv_heads: list[int] = self.hparams["num_key_value_heads_per_layer"] + assert self.block_count == len(self._num_kv_heads) + self.gguf_writer.add_head_count_kv(self._num_kv_heads) + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + self.gguf_writer.add_rope_dimension_count(rope_dim) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + if bid is not None: + if "num_key_value_heads_per_layer" in self.hparams: + n_kv_head = self.hparams["num_key_value_heads_per_layer"][bid] + elif "block_configs" in self.hparams: + n_kv_head = self._num_kv_heads[bid] + n_head = self._num_heads[bid] + else: + n_kv_head = self.hparams.get("num_key_value_heads") + else: + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeciModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if rope_scaling := self.find_hparam(["rope_scaling"], optional=True): + if rope_scaling.get("rope_type", '').lower() == "llama3": + base = self.hparams.get("rope_theta", 10000.0) + dim = self.hparams.get("head_dim", self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + freqs = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)) + + factor = rope_scaling.get("factor", 8.0) + low_freq_factor = rope_scaling.get("low_freq_factor", 1.0) + high_freq_factor = rope_scaling.get("high_freq_factor", 4.0) + old_context_len = self.hparams.get("original_max_position_embeddings", 8192) + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + assert low_freq_wavelen != high_freq_wavelen + + rope_factors = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + rope_factors.append(1) + elif wavelen > low_freq_wavelen: + rope_factors.append(factor) + else: + smooth = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + rope_factors.append(1 / ((1 - smooth) / factor + smooth)) + + yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) + + def prepare_tensors(self): + super().prepare_tensors() + + @Model.register("BitnetForCausalLM") class BitnetModel(Model): model_arch = gguf.MODEL_ARCH.BITNET @@ -2024,6 +2228,44 @@ class Qwen2VLModel(Model): yield name, data +@Model.register("WavTokenizerDec") +class WavTokenizerDecModel(Model): + model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if \ + name.endswith("codebook.cluster_size") or \ + name.endswith("codebook.embed_avg") or \ + name.endswith("codebook.inited"): + logger.debug(f"Skipping {name!r}") + return [] + + logger.info(f"{self.map_tensor_name(name)} -> {data_torch.shape}") + + return [(self.map_tensor_name(name), data_torch)] + + def set_vocab(self): + self._set_vocab_none() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size (self.hparams["vocab_size"]) + self.gguf_writer.add_features_length (self.hparams["n_embd_features"]) + self.gguf_writer.add_feed_forward_length(self.hparams["n_ff"]) + self.gguf_writer.add_group_norm_eps (self.hparams["group_norm_epsilon"]) + self.gguf_writer.add_group_norm_groups (self.hparams["group_norm_groups"]) + + self.gguf_writer.add_posnet_embedding_length(self.hparams["posnet"]["n_embd"]) + self.gguf_writer.add_posnet_block_count (self.hparams["posnet"]["n_layer"]) + + self.gguf_writer.add_convnext_embedding_length(self.hparams["convnext"]["n_embd"]) + self.gguf_writer.add_convnext_block_count (self.hparams["convnext"]["n_layer"]) + + self.gguf_writer.add_causal_attention(False) + + @Model.register("Qwen2MoeForCausalLM") class Qwen2MoeModel(Model): model_arch = gguf.MODEL_ARCH.QWEN2MOE @@ -2152,6 +2394,15 @@ class Phi3MiniModel(Model): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): + # Phi-4 model uses GPT2Tokenizer + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + tokenizer_class = tokenizer_config_json['tokenizer_class'] + if tokenizer_class == 'GPT2Tokenizer': + return self._set_vocab_gpt2() + from sentencepiece import SentencePieceProcessor tokenizer_path = self.dir_model / 'tokenizer.model' @@ -2268,7 +2519,11 @@ class Phi3MiniModel(Model): self.gguf_writer.add_rope_dimension_count(rope_dims) self.gguf_writer.add_rope_freq_base(self.find_hparam(["rope_theta"])) self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_sliding_window(self.find_hparam(["sliding_window"])) + sliding_window = self.hparams.get("sliding_window") + # use zero value of sliding_window to distinguish Phi-4 from other PHI3 models + if sliding_window is None: + sliding_window = 0 + self.gguf_writer.add_sliding_window(sliding_window) def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: n_embd = self.find_hparam(["hidden_size", "n_embd"]) @@ -2567,7 +2822,7 @@ class InternLM2Model(Model): return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "CamembertModel", "RobertaModel") +@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT @@ -2633,13 +2888,73 @@ class BertModel(Model): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + if name.startswith("bert."): + name = name[5:] + + if name.endswith(".gamma"): + name = name[:-6] + ".weight" + + if name.endswith(".beta"): + name = name[:-5] + ".bias" + # we are only using BERT for embeddings so we don't need the pooling layer if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): return [] # we don't need these + if name.startswith("cls.predictions"): + return [] + + if name.startswith("cls.seq_relationship"): + return [] + return [(self.map_tensor_name(name), data_torch)] +@Model.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + @Model.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -2959,6 +3274,9 @@ class Rwkv6Model(Model): if new_name.endswith("time_mix_w2.weight"): data_torch = data_torch.permute(0, 2, 1) + if new_name.endswith("time_mix_decay.weight") or "lerp" in new_name: + data_torch = data_torch.squeeze() + rescale_every_n_layers = self.hparams["rescale_every"] if rescale_every_n_layers > 0: if new_name.endswith("time_mix_output.weight") or new_name.endswith("channel_mix_value.weight"): @@ -3427,6 +3745,97 @@ class ArcticModel(Model): raise ValueError(f"Unprocessed experts: {experts}") +@Model.register("DeepseekForCausalLM") +class DeepseekModel(Model): + model_arch = gguf.MODEL_ARCH.DEEPSEEK + + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if "head_dim" in hparams: + rope_dim = hparams["head_dim"] + else: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(rope_dim) + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_weights_scale(1.0) + self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) + + _experts: list[dict[str, Tensor]] | None = None + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = DeepseekModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("mlp.experts") != -1: + n_experts = self.hparams["n_routed_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @Model.register("DeepseekV2ForCausalLM") class DeepseekV2Model(Model): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index aa10e5db7..fea23ddb4 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -72,6 +72,7 @@ models = [ {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base", }, {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5", }, {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, @@ -104,6 +105,8 @@ models = [ {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b", }, {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", }, {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"}, + {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"}, + {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"}, ] diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 21b31392e..66cfab2c3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -51,6 +51,7 @@ else() add_subdirectory(speculative) add_subdirectory(speculative-simple) add_subdirectory(tokenize) + add_subdirectory(tts) add_subdirectory(gen-docs) if (NOT GGML_BACKEND_DL) # these examples use the backends directly and cannot be built with dynamic loading diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index ba219cd4b..e2e01f2d5 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -65,6 +65,7 @@ int main(int argc, char ** argv) { llama_context * ctx = llama_new_context_with_model(model, ctx_params); auto sparams = llama_sampler_chain_default_params(); + sparams.no_perf = false; llama_sampler * smpl = llama_sampler_chain_init(sparams); diff --git a/examples/cvector-generator/mean.hpp b/examples/cvector-generator/mean.hpp index 16be5ce3e..4eeac1eeb 100644 --- a/examples/cvector-generator/mean.hpp +++ b/examples/cvector-generator/mean.hpp @@ -15,7 +15,7 @@ static void run( for (size_t il = 0; il < v_input.size(); ++il) { // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); + ggml_format_name(ctrl_out, "direction.%zu", il+1); // calculate mean vector struct ggml_tensor * t_layer = v_input[il]; diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp index f6e307fbc..e88bbdde9 100644 --- a/examples/cvector-generator/pca.hpp +++ b/examples/cvector-generator/pca.hpp @@ -302,7 +302,7 @@ static void run_pca( // prepare output vector struct ggml_tensor * ctrl_out = v_output[il]; - ggml_format_name(ctrl_out, "direction.%ld", il+1); + ggml_format_name(ctrl_out, "direction.%zu", il+1); // run power_iteration params.i_layer = il; diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 67662313d..058b5cc86 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -265,8 +265,8 @@ struct lora_merge_ctx { fout.write((const char *)data.data(), data.size()); } - printf("%s : merged %ld tensors with lora adapters\n", __func__, n_merged); - printf("%s : wrote %ld tensors to output file\n", __func__, trans.size()); + printf("%s : merged %zu tensors with lora adapters\n", __func__, n_merged); + printf("%s : wrote %zu tensors to output file\n", __func__, trans.size()); } void copy_tensor(struct ggml_tensor * base) { @@ -352,7 +352,7 @@ struct lora_merge_ctx { const float scale = alpha ? adapters[i]->scale * alpha / rank : adapters[i]->scale; delta = ggml_scale(ctx0, delta, scale); cur = ggml_add(ctx0, delta, cur); - printf("%s : + merging from adapter[%ld] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); + printf("%s : + merging from adapter[%zu] type=%s\n", __func__, i, ggml_type_name(inp_a[i]->type)); printf("%s : input_scale=%f calculated_scale=%f rank=%d\n", __func__, adapters[i]->scale, scale, (int) inp_b[i]->ne[0]); } cur = ggml_cast(ctx0, cur, out->type); diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp index 7493af9d3..17a0e27c4 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -11,19 +11,15 @@ static bool llama_grammar_validate(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { const auto cpts = unicode_cpts_from_utf8(input_str); - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); + auto & stacks_cur = llama_grammar_get_stacks(grammar); size_t pos = 0; for (const auto & cpt : cpts) { - const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy - - llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); + llama_grammar_accept(grammar, cpt); if (stacks_cur.empty()) { error_pos = pos; error_msg = "Unexpected character '" + unicode_cpt_to_utf8(cpt) + "'"; - stacks_cur = stacks_prev; return false; } ++pos; @@ -82,7 +78,8 @@ int main(int argc, char** argv) { llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_str.c_str(), "root"); if (grammar == nullptr) { - throw std::runtime_error("Failed to initialize llama_grammar"); + fprintf(stdout, "Failed to initialize llama_grammar\n"); + return 1; } // Read the input file std::string input_str; diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index 6e42fa073..18a945b33 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -75,7 +75,7 @@ static std::vector> encode(llama_context * ctx, const std::ve } std::vector emb_norm(emb_unorm.size()); - common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd); + common_embd_normalize(emb_unorm.data(), emb_norm.data(), n_embd, 2); result.push_back(emb_norm); #ifdef GRIT_DEBUG diff --git a/examples/llama.android/llama/build.gradle.kts b/examples/llama.android/llama/build.gradle.kts index 2d1dfba20..28dbc1904 100644 --- a/examples/llama.android/llama/build.gradle.kts +++ b/examples/llama.android/llama/build.gradle.kts @@ -19,6 +19,7 @@ android { externalNativeBuild { cmake { arguments += "-DLLAMA_BUILD_COMMON=ON" + arguments += "-DGGML_LLAMAFILE=OFF" arguments += "-DCMAKE_BUILD_TYPE=Release" cppFlags += listOf() arguments += listOf() diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ba28c07c6..3cd0d2fa8 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -8,25 +8,25 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUDA -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_SYCL -#include "ggml-sycl.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#ifdef GGML_USE_CANN -#include "ggml-cann.h" -#endif - -#ifdef GGML_USE_VULKAN -#include "ggml-vulkan.h" -#endif +//#ifdef GGML_USE_CUDA +//#include "ggml-cuda.h" +//#endif +// +//#ifdef GGML_USE_SYCL +//#include "ggml-sycl.h" +//#endif +// +//#ifdef GGML_USE_METAL +//#include "ggml-metal.h" +//#endif +// +//#ifdef GGML_USE_CANN +//#include "ggml-cann.h" +//#endif +// +//#ifdef GGML_USE_VULKAN +//#include "ggml-vulkan.h" +//#endif #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" @@ -896,7 +896,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_3 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_3, 1, 0, 2, 3)); mlp_3 = ggml_reshape_4d(ctx0, mlp_3, n_patch, n_patch, mlp_3->ne[1], mlp_3->ne[2]); // stride = 1, padding = 1, bias is nullptr - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_1_block_0_0_w, mlp_3, 1, 1, 1, 1, 1, 1); // layer norm // // block_1 shape = [1, 2048, 24, 24], ne = [24, 24, 2048, 1] @@ -944,7 +944,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // block_2 { // stride = 2 - block_1 = ggml_conv_depthwise_2d(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); + block_1 = ggml_conv_2d_dw(ctx0, model.mm_model_block_2_block_0_0_w, block_1, 2, 2, 1, 1, 1, 1); // block_1 shape = [1, 2048, 12, 12], ne = [12, 12, 2048, 1] // layer norm @@ -1005,7 +1005,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 // mlp_2 ne [24, 24, 2048, 1] mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] - struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); + struct ggml_tensor * peg_0 = ggml_conv_2d_dw(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); @@ -1222,30 +1222,30 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -#ifdef GGML_USE_CUDA - new_clip->backend = ggml_backend_cuda_init(0); - LOG_INF("%s: CLIP using CUDA backend\n", __func__); -#endif - -#ifdef GGML_USE_METAL - new_clip->backend = ggml_backend_metal_init(); - LOG_INF("%s: CLIP using Metal backend\n", __func__); -#endif - -#ifdef GGML_USE_CANN - new_clip->backend = ggml_backend_cann_init(0); - LOG_INF("%s: CLIP using CANN backend\n", __func__); -#endif - -#ifdef GGML_USE_VULKAN - new_clip->backend = ggml_backend_vk_init(0); - LOG_INF("%s: CLIP using Vulkan backend\n", __func__); -#endif - -#ifdef GGML_USE_SYCL - new_clip->backend = ggml_backend_sycl_init(0); - LOG_INF("%s: CLIP using SYCL backend\n", __func__); -#endif +//#ifdef GGML_USE_CUDA +// new_clip->backend = ggml_backend_cuda_init(0); +// LOG_INF("%s: CLIP using CUDA backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_METAL +// new_clip->backend = ggml_backend_metal_init(); +// LOG_INF("%s: CLIP using Metal backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_CANN +// new_clip->backend = ggml_backend_cann_init(0); +// LOG_INF("%s: CLIP using CANN backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_VULKAN +// new_clip->backend = ggml_backend_vk_init(0); +// LOG_INF("%s: CLIP using Vulkan backend\n", __func__); +//#endif +// +//#ifdef GGML_USE_SYCL +// new_clip->backend = ggml_backend_sycl_init(0); +// LOG_INF("%s: CLIP using SYCL backend\n", __func__); +//#endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py index 464ab80d3..c87606b4f 100644 --- a/examples/llava/qwen2_vl_surgery.py +++ b/examples/llava/qwen2_vl_surgery.py @@ -88,6 +88,8 @@ def main(args): else: raise ValueError() + local_model = False + model_path = "" model_name = args.model_name print("model_name: ", model_name) qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( @@ -97,8 +99,10 @@ def main(args): vcfg = cfg.vision_config if os.path.isdir(model_name): + local_model = True if model_name.endswith(os.sep): model_name = model_name[:-1] + model_path = model_name model_name = os.path.basename(model_name) fname_out = f"{model_name.replace('/', '-').lower()}-vision.gguf" @@ -139,7 +143,10 @@ def main(args): it will be hardcoded in the `clip_image_build_graph` from `clip.cpp`. """ - processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name) + if local_model: + processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_path) + else: + processor: Qwen2VLProcessor = AutoProcessor.from_pretrained(model_name) fout.add_array("clip.vision.image_mean", processor.image_processor.image_mean) # type: ignore[reportAttributeAccessIssue] fout.add_array("clip.vision.image_std", processor.image_processor.image_std) # type: ignore[reportAttributeAccessIssue] diff --git a/examples/main/README.md b/examples/main/README.md index 7787f7b11..17d80a622 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -177,16 +177,11 @@ Example usage: `--temp 0` - `--repeat-penalty N`: Control the repetition of token sequences in the generated text default: 1.0, 1.0 = disabled). - `--repeat-last-n N`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). -- `--no-penalize-nl`: Disable penalization for newline tokens when applying the repeat penalty. The `repeat-penalty` option helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. The default value is 1. The `repeat-last-n` option controls the number of tokens in the history to consider for penalizing repetition. A larger value will look further back in the generated text to prevent repetitions, while a smaller value will only consider recent tokens. A value of 0 disables the penalty, and a value of -1 sets the number of tokens considered equal to the context size (`ctx-size`). -Use the `--no-penalize-nl` option to disable newline penalization when applying the repeat penalty. This option is particularly useful for generating chat conversations, dialogues, code, poetry, or any text where newline tokens play a significant role in structure and formatting. Disabling newline penalization helps maintain the natural flow and intended formatting in these specific use cases. - -Example usage: `--repeat-penalty 1.15 --repeat-last-n 128 --no-penalize-nl` - ### DRY Repetition Penalty DRY (Don't Repeat Yourself) sampling is an effective technique for reducing repetition in generated text even across long contexts by penalizing tokens based on their recent usage patterns (original [PR link](https://github.com/oobabooga/text-generation-webui/pull/5677)). diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp index 23ff4db27..a5c6fe7e5 100644 --- a/examples/retrieval/retrieval.cpp +++ b/examples/retrieval/retrieval.cpp @@ -107,7 +107,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu } float * out = output + batch.seq_id[i][0] * n_embd; - common_embd_normalize(embd, out, n_embd); + common_embd_normalize(embd, out, n_embd, 2); } } diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 5fe70dac7..8b1b23eda 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -12,6 +12,10 @@ #include "ggml-vulkan.h" #endif +#ifdef GGML_USE_SYCL +#include "ggml-sycl.h" +#endif + #include "ggml-rpc.h" #ifdef _WIN32 # include @@ -91,6 +95,12 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: ggml_backend_vulkan_init() failed\n", __func__); } +#elif GGML_USE_SYCL + fprintf(stderr, "%s: using SYCL backend\n", __func__); + backend = ggml_backend_sycl_init(0); // init device 0 + if (!backend) { + fprintf(stderr, "%s: ggml_backend_sycl_init() failed\n", __func__); + } #endif // if there aren't GPU Backends fallback to CPU backend @@ -106,6 +116,8 @@ static void get_backend_memory(size_t * free_mem, size_t * total_mem) { ggml_backend_cuda_get_device_memory(0, free_mem, total_mem); #elif GGML_USE_VULKAN ggml_backend_vk_get_device_memory(0, free_mem, total_mem); +#elif GGML_USE_SYCL + ggml_backend_sycl_get_device_memory(0, free_mem, total_mem); #else #ifdef _WIN32 MEMORYSTATUSEX status; diff --git a/examples/run/README.md b/examples/run/README.md index 6162658e9..a06805441 100644 --- a/examples/run/README.md +++ b/examples/run/README.md @@ -4,7 +4,7 @@ The purpose of this example is to demonstrate a minimal usage of llama.cpp for r ```bash llama-run granite-code -... +``` ```bash llama-run -h @@ -19,6 +19,10 @@ Options: Context size (default: 2048) -n, --ngl Number of GPU layers (default: 0) + --temp + Temperature (default: 0.8) + -v, --verbose, --log-verbose + Set verbosity level to infinity (i.e. log all messages, useful for debugging) -h, --help Show help message @@ -42,6 +46,6 @@ Examples: llama-run https://example.com/some-file1.gguf llama-run some-file2.gguf llama-run file://some-file3.gguf - llama-run --ngl 99 some-file4.gguf - llama-run --ngl 99 some-file5.gguf Hello World -... + llama-run --ngl 999 some-file4.gguf + llama-run --ngl 999 some-file5.gguf Hello World +``` diff --git a/examples/run/run.cpp b/examples/run/run.cpp index 834ea8f7b..f89d041c4 100644 --- a/examples/run/run.cpp +++ b/examples/run/run.cpp @@ -1,6 +1,8 @@ #if defined(_WIN32) # include #else +# include +# include # include #endif @@ -8,6 +10,7 @@ # include #endif +#include #include #include #include @@ -21,41 +24,150 @@ #include "json.hpp" #include "llama-cpp.h" -#define printe(...) \ - do { \ - fprintf(stderr, __VA_ARGS__); \ - } while (0) +GGML_ATTRIBUTE_FORMAT(1, 2) +static std::string fmt(const char * fmt, ...) { + va_list ap; + va_list ap2; + va_start(ap, fmt); + va_copy(ap2, ap); + const int size = vsnprintf(NULL, 0, fmt, ap); + GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT + std::string buf; + buf.resize(size); + const int size2 = vsnprintf(const_cast(buf.data()), buf.size() + 1, fmt, ap2); + GGML_ASSERT(size2 == size); + va_end(ap2); + va_end(ap); + + return buf; +} + +GGML_ATTRIBUTE_FORMAT(1, 2) +static int printe(const char * fmt, ...) { + va_list args; + va_start(args, fmt); + const int ret = vfprintf(stderr, fmt, args); + va_end(args); + + return ret; +} class Opt { public: int init(int argc, const char ** argv) { - construct_help_str_(); + ctx_params = llama_context_default_params(); + model_params = llama_model_default_params(); + context_size_default = ctx_params.n_batch; + ngl_default = model_params.n_gpu_layers; + common_params_sampling sampling; + temperature_default = sampling.temp; + + if (argc < 2) { + printe("Error: No arguments provided.\n"); + print_help(); + return 1; + } + // Parse arguments if (parse(argc, argv)) { printe("Error: Failed to parse arguments.\n"); - help(); + print_help(); return 1; } // If help is requested, show help and exit - if (help_) { - help(); + if (help) { + print_help(); return 2; } + ctx_params.n_batch = context_size >= 0 ? context_size : context_size_default; + model_params.n_gpu_layers = ngl >= 0 ? ngl : ngl_default; + temperature = temperature >= 0 ? temperature : temperature_default; + return 0; // Success } + llama_context_params ctx_params; + llama_model_params model_params; std::string model_; - std::string user_; - int context_size_ = 2048, ngl_ = -1; + std::string user; + int context_size = -1, ngl = -1; + float temperature = -1; + bool verbose = false; private: - std::string help_str_; - bool help_ = false; + int context_size_default = -1, ngl_default = -1; + float temperature_default = -1; + bool help = false; - void construct_help_str_() { - help_str_ = + bool parse_flag(const char ** argv, int i, const char * short_opt, const char * long_opt) { + return strcmp(argv[i], short_opt) == 0 || strcmp(argv[i], long_opt) == 0; + } + + int handle_option_with_value(int argc, const char ** argv, int & i, int & option_value) { + if (i + 1 >= argc) { + return 1; + } + + option_value = std::atoi(argv[++i]); + + return 0; + } + + int handle_option_with_value(int argc, const char ** argv, int & i, float & option_value) { + if (i + 1 >= argc) { + return 1; + } + + option_value = std::atof(argv[++i]); + + return 0; + } + + int parse(int argc, const char ** argv) { + bool options_parsing = true; + for (int i = 1, positional_args_i = 0; i < argc; ++i) { + if (options_parsing && (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0)) { + if (handle_option_with_value(argc, argv, i, context_size) == 1) { + return 1; + } + } else if (options_parsing && (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0)) { + if (handle_option_with_value(argc, argv, i, ngl) == 1) { + return 1; + } + } else if (options_parsing && strcmp(argv[i], "--temp") == 0) { + if (handle_option_with_value(argc, argv, i, temperature) == 1) { + return 1; + } + } else if (options_parsing && + (parse_flag(argv, i, "-v", "--verbose") || parse_flag(argv, i, "-v", "--log-verbose"))) { + verbose = true; + } else if (options_parsing && parse_flag(argv, i, "-h", "--help")) { + help = true; + return 0; + } else if (options_parsing && strcmp(argv[i], "--") == 0) { + options_parsing = false; + } else if (positional_args_i == 0) { + if (!argv[i][0] || argv[i][0] == '-') { + return 1; + } + + ++positional_args_i; + model_ = argv[i]; + } else if (positional_args_i == 1) { + ++positional_args_i; + user = argv[i]; + } else { + user += " " + std::string(argv[i]); + } + } + + return 0; + } + + void print_help() const { + printf( "Description:\n" " Runs a llm\n" "\n" @@ -64,15 +176,13 @@ class Opt { "\n" "Options:\n" " -c, --context-size \n" - " Context size (default: " + - std::to_string(context_size_); - help_str_ += - ")\n" + " Context size (default: %d)\n" " -n, --ngl \n" - " Number of GPU layers (default: " + - std::to_string(ngl_); - help_str_ += - ")\n" + " Number of GPU layers (default: %d)\n" + " --temp \n" + " Temperature (default: %.1f)\n" + " -v, --verbose, --log-verbose\n" + " Set verbosity level to infinity (i.e. log all messages, useful for debugging)\n" " -h, --help\n" " Show help message\n" "\n" @@ -92,67 +202,102 @@ class Opt { " llama-run ollama://granite-code\n" " llama-run ollama://smollm:135m\n" " llama-run hf://QuantFactory/SmolLM-135M-GGUF/SmolLM-135M.Q2_K.gguf\n" - " llama-run huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" + " llama-run " + "huggingface://bartowski/SmolLM-1.7B-Instruct-v0.2-GGUF/SmolLM-1.7B-Instruct-v0.2-IQ3_M.gguf\n" " llama-run https://example.com/some-file1.gguf\n" " llama-run some-file2.gguf\n" " llama-run file://some-file3.gguf\n" - " llama-run --ngl 99 some-file4.gguf\n" - " llama-run --ngl 99 some-file5.gguf Hello World\n"; + " llama-run --ngl 999 some-file4.gguf\n" + " llama-run --ngl 999 some-file5.gguf Hello World\n", + context_size_default, ngl_default, temperature_default); } - - int parse(int argc, const char ** argv) { - int positional_args_i = 0; - for (int i = 1; i < argc; ++i) { - if (strcmp(argv[i], "-c") == 0 || strcmp(argv[i], "--context-size") == 0) { - if (i + 1 >= argc) { - return 1; - } - - context_size_ = std::atoi(argv[++i]); - } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--ngl") == 0) { - if (i + 1 >= argc) { - return 1; - } - - ngl_ = std::atoi(argv[++i]); - } else if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { - help_ = true; - return 0; - } else if (!positional_args_i) { - ++positional_args_i; - model_ = argv[i]; - } else if (positional_args_i == 1) { - ++positional_args_i; - user_ = argv[i]; - } else { - user_ += " " + std::string(argv[i]); - } - } - - return model_.empty(); // model_ is the only required value - } - - void help() const { printf("%s", help_str_.c_str()); } }; struct progress_data { - size_t file_size = 0; + size_t file_size = 0; std::chrono::steady_clock::time_point start_time = std::chrono::steady_clock::now(); - bool printed = false; + bool printed = false; }; -struct FileDeleter { - void operator()(FILE * file) const { +static int get_terminal_width() { +#if defined(_WIN32) + CONSOLE_SCREEN_BUFFER_INFO csbi; + GetConsoleScreenBufferInfo(GetStdHandle(STD_OUTPUT_HANDLE), &csbi); + return csbi.srWindow.Right - csbi.srWindow.Left + 1; +#else + struct winsize w; + ioctl(STDOUT_FILENO, TIOCGWINSZ, &w); + return w.ws_col; +#endif +} + +#ifdef LLAMA_USE_CURL +class File { + public: + FILE * file = nullptr; + + FILE * open(const std::string & filename, const char * mode) { + file = fopen(filename.c_str(), mode); + + return file; + } + + int lock() { + if (file) { +# ifdef _WIN32 + fd = _fileno(file); + hFile = (HANDLE) _get_osfhandle(fd); + if (hFile == INVALID_HANDLE_VALUE) { + fd = -1; + + return 1; + } + + OVERLAPPED overlapped = { 0 }; + if (!LockFileEx(hFile, LOCKFILE_EXCLUSIVE_LOCK | LOCKFILE_FAIL_IMMEDIATELY, 0, MAXDWORD, MAXDWORD, + &overlapped)) { + fd = -1; + + return 1; + } +# else + fd = fileno(file); + if (flock(fd, LOCK_EX | LOCK_NB) != 0) { + fd = -1; + + return 1; + } +# endif + } + + return 0; + } + + ~File() { + if (fd >= 0) { +# ifdef _WIN32 + if (hFile != INVALID_HANDLE_VALUE) { + OVERLAPPED overlapped = { 0 }; + UnlockFileEx(hFile, 0, MAXDWORD, MAXDWORD, &overlapped); + } +# else + flock(fd, LOCK_UN); +# endif + } + if (file) { fclose(file); } } + + private: + int fd = -1; +# ifdef _WIN32 + HANDLE hFile; +# endif }; -typedef std::unique_ptr FILE_ptr; - -#ifdef LLAMA_USE_CURL -class CurlWrapper { +class HttpClient { public: int init(const std::string & url, const std::vector & headers, const std::string & output_file, const bool progress, std::string * response_str = nullptr) { @@ -163,10 +308,20 @@ class CurlWrapper { } progress_data data; - FILE_ptr out; + File out; if (!output_file.empty()) { output_file_partial = output_file + ".partial"; - out.reset(fopen(output_file_partial.c_str(), "ab")); + if (!out.open(output_file_partial, "ab")) { + printe("Failed to open file\n"); + + return 1; + } + + if (out.lock()) { + printe("Failed to exclusively lock file\n"); + + return 1; + } } set_write_options(response_str, out); @@ -181,7 +336,7 @@ class CurlWrapper { return 0; } - ~CurlWrapper() { + ~HttpClient() { if (chunk) { curl_slist_free_all(chunk); } @@ -195,13 +350,13 @@ class CurlWrapper { CURL * curl = nullptr; struct curl_slist * chunk = nullptr; - void set_write_options(std::string * response_str, const FILE_ptr & out) { + void set_write_options(std::string * response_str, const File & out) { if (response_str) { curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, capture_data); curl_easy_setopt(curl, CURLOPT_WRITEDATA, response_str); } else { curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.get()); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, out.file); } } @@ -219,7 +374,7 @@ class CurlWrapper { if (progress) { curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); curl_easy_setopt(curl, CURLOPT_XFERINFODATA, &data); - curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, progress_callback); + curl_easy_setopt(curl, CURLOPT_XFERINFOFUNCTION, update_progress); } } @@ -255,37 +410,31 @@ class CurlWrapper { int mins = (static_cast(seconds) % 3600) / 60; int secs = static_cast(seconds) % 60; - std::ostringstream out; if (hrs > 0) { - out << hrs << "h " << std::setw(2) << std::setfill('0') << mins << "m " << std::setw(2) << std::setfill('0') - << secs << "s"; + return fmt("%dh %02dm %02ds", hrs, mins, secs); } else if (mins > 0) { - out << mins << "m " << std::setw(2) << std::setfill('0') << secs << "s"; + return fmt("%dm %02ds", mins, secs); } else { - out << secs << "s"; + return fmt("%ds", secs); } - - return out.str(); } static std::string human_readable_size(curl_off_t size) { static const char * suffix[] = { "B", "KB", "MB", "GB", "TB" }; - char length = sizeof(suffix) / sizeof(suffix[0]); - int i = 0; - double dbl_size = size; + char length = sizeof(suffix) / sizeof(suffix[0]); + int i = 0; + double dbl_size = size; if (size > 1024) { for (i = 0; (size / 1024) > 0 && i < length - 1; i++, size /= 1024) { dbl_size = size / 1024.0; } } - std::ostringstream out; - out << std::fixed << std::setprecision(2) << dbl_size << " " << suffix[i]; - return out.str(); + return fmt("%.2f %s", dbl_size, suffix[i]); } - static int progress_callback(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, - curl_off_t) { + static int update_progress(void * ptr, curl_off_t total_to_download, curl_off_t now_downloaded, curl_off_t, + curl_off_t) { progress_data * data = static_cast(ptr); if (total_to_download <= 0) { return 0; @@ -293,27 +442,68 @@ class CurlWrapper { total_to_download += data->file_size; const curl_off_t now_downloaded_plus_file_size = now_downloaded + data->file_size; - const curl_off_t percentage = (now_downloaded_plus_file_size * 100) / total_to_download; - const curl_off_t pos = (percentage / 5); - std::string progress_bar; - for (int i = 0; i < 20; ++i) { - progress_bar.append((i < pos) ? "█" : " "); - } + const curl_off_t percentage = calculate_percentage(now_downloaded_plus_file_size, total_to_download); + std::string progress_prefix = generate_progress_prefix(percentage); - // Calculate download speed and estimated time to completion - const auto now = std::chrono::steady_clock::now(); - const std::chrono::duration elapsed_seconds = now - data->start_time; - const double speed = now_downloaded / elapsed_seconds.count(); - const double estimated_time = (total_to_download - now_downloaded) / speed; - printe("\r%ld%% |%s| %s/%s %.2f MB/s %s ", percentage, progress_bar.c_str(), - human_readable_size(now_downloaded).c_str(), human_readable_size(total_to_download).c_str(), - speed / (1024 * 1024), human_readable_time(estimated_time).c_str()); - fflush(stderr); + const double speed = calculate_speed(now_downloaded, data->start_time); + const double tim = (total_to_download - now_downloaded) / speed; + std::string progress_suffix = + generate_progress_suffix(now_downloaded_plus_file_size, total_to_download, speed, tim); + + int progress_bar_width = calculate_progress_bar_width(progress_prefix, progress_suffix); + std::string progress_bar; + generate_progress_bar(progress_bar_width, percentage, progress_bar); + + print_progress(progress_prefix, progress_bar, progress_suffix); data->printed = true; return 0; } + static curl_off_t calculate_percentage(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download) { + return (now_downloaded_plus_file_size * 100) / total_to_download; + } + + static std::string generate_progress_prefix(curl_off_t percentage) { return fmt("%3ld%% |", percentage); } + + static double calculate_speed(curl_off_t now_downloaded, const std::chrono::steady_clock::time_point & start_time) { + const auto now = std::chrono::steady_clock::now(); + const std::chrono::duration elapsed_seconds = now - start_time; + return now_downloaded / elapsed_seconds.count(); + } + + static std::string generate_progress_suffix(curl_off_t now_downloaded_plus_file_size, curl_off_t total_to_download, + double speed, double estimated_time) { + const int width = 10; + return fmt("%*s/%*s%*s/s%*s", width, human_readable_size(now_downloaded_plus_file_size).c_str(), width, + human_readable_size(total_to_download).c_str(), width, human_readable_size(speed).c_str(), width, + human_readable_time(estimated_time).c_str()); + } + + static int calculate_progress_bar_width(const std::string & progress_prefix, const std::string & progress_suffix) { + int progress_bar_width = get_terminal_width() - progress_prefix.size() - progress_suffix.size() - 3; + if (progress_bar_width < 1) { + progress_bar_width = 1; + } + + return progress_bar_width; + } + + static std::string generate_progress_bar(int progress_bar_width, curl_off_t percentage, + std::string & progress_bar) { + const curl_off_t pos = (percentage * progress_bar_width) / 100; + for (int i = 0; i < progress_bar_width; ++i) { + progress_bar.append((i < pos) ? "█" : " "); + } + + return progress_bar; + } + + static void print_progress(const std::string & progress_prefix, const std::string & progress_bar, + const std::string & progress_suffix) { + printe("\r%*s\r%s%s| %s", get_terminal_width(), " ", progress_prefix.c_str(), progress_bar.c_str(), + progress_suffix.c_str()); + } // Function to write data to a file static size_t write_data(void * ptr, size_t size, size_t nmemb, void * stream) { FILE * out = static_cast(stream); @@ -344,12 +534,12 @@ class LlamaData { return 1; } - context = initialize_context(model, opt.context_size_); + context = initialize_context(model, opt); if (!context) { return 1; } - sampler = initialize_sampler(); + sampler = initialize_sampler(opt); return 0; } @@ -357,8 +547,8 @@ class LlamaData { #ifdef LLAMA_USE_CURL int download(const std::string & url, const std::vector & headers, const std::string & output_file, const bool progress, std::string * response_str = nullptr) { - CurlWrapper curl; - if (curl.init(url, headers, output_file, progress, response_str)) { + HttpClient http; + if (http.init(url, headers, output_file, progress, response_str)) { return 1; } @@ -438,13 +628,17 @@ class LlamaData { } int resolve_model(std::string & model_) { + int ret = 0; + if (string_starts_with(model_, "file://") || std::filesystem::exists(model_)) { + remove_proto(model_); + + return ret; + } + const std::string bn = basename(model_); const std::vector headers = { "--header", "Accept: application/vnd.docker.distribution.manifest.v2+json" }; - int ret = 0; - if (string_starts_with(model_, "file://") || std::filesystem::exists(bn)) { - remove_proto(model_); - } else if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { + if (string_starts_with(model_, "hf://") || string_starts_with(model_, "huggingface://")) { remove_proto(model_); ret = huggingface_dl(model_, headers, bn); } else if (string_starts_with(model_, "ollama://")) { @@ -464,23 +658,23 @@ class LlamaData { // Initializes the model and returns a unique pointer to it llama_model_ptr initialize_model(Opt & opt) { ggml_backend_load_all(); - llama_model_params model_params = llama_model_default_params(); - model_params.n_gpu_layers = opt.ngl_ >= 0 ? opt.ngl_ : model_params.n_gpu_layers; resolve_model(opt.model_); - llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), model_params)); + printe( + "\r%*s" + "\rLoading model", + get_terminal_width(), " "); + llama_model_ptr model(llama_load_model_from_file(opt.model_.c_str(), opt.model_params)); if (!model) { printe("%s: error: unable to load model from file: %s\n", __func__, opt.model_.c_str()); } + printe("\r%*s\r", static_cast(sizeof("Loading model")), " "); return model; } // Initializes the context with the specified parameters - llama_context_ptr initialize_context(const llama_model_ptr & model, const int n_ctx) { - llama_context_params ctx_params = llama_context_default_params(); - ctx_params.n_ctx = n_ctx; - ctx_params.n_batch = n_ctx; - llama_context_ptr context(llama_new_context_with_model(model.get(), ctx_params)); + llama_context_ptr initialize_context(const llama_model_ptr & model, const Opt & opt) { + llama_context_ptr context(llama_new_context_with_model(model.get(), opt.ctx_params)); if (!context) { printe("%s: error: failed to create the llama_context\n", __func__); } @@ -489,10 +683,10 @@ class LlamaData { } // Initializes and configures the sampler - llama_sampler_ptr initialize_sampler() { + llama_sampler_ptr initialize_sampler(const Opt & opt) { llama_sampler_ptr sampler(llama_sampler_chain_init(llama_sampler_chain_default_params())); llama_sampler_chain_add(sampler.get(), llama_sampler_init_min_p(0.05f, 1)); - llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(0.8f)); + llama_sampler_chain_add(sampler.get(), llama_sampler_init_temp(opt.temperature)); llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(LLAMA_DEFAULT_SEED)); return sampler; @@ -609,16 +803,20 @@ static int read_user_input(std::string & user) { } // Function to generate a response based on the prompt -static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response) { +static int generate_response(LlamaData & llama_data, const std::string & prompt, std::string & response, + const bool stdout_a_terminal) { // Set response color - printf("\033[33m"); + if (stdout_a_terminal) { + printf("\033[33m"); + } + if (generate(llama_data, prompt, response)) { printe("failed to generate response\n"); return 1; } // End response with color reset and newline - printf("\n\033[0m"); + printf("\n%s", stdout_a_terminal ? "\033[0m" : ""); return 0; } @@ -635,29 +833,51 @@ static int apply_chat_template_with_error_handling(LlamaData & llama_data, const } // Helper function to handle user input -static int handle_user_input(std::string & user_input, const std::string & user_) { - if (!user_.empty()) { - user_input = user_; +static int handle_user_input(std::string & user_input, const std::string & user) { + if (!user.empty()) { + user_input = user; return 0; // No need for interactive input } printf( - "\r " - "\r\033[32m> \033[0m"); + "\r%*s" + "\r\033[32m> \033[0m", + get_terminal_width(), " "); return read_user_input(user_input); // Returns true if input ends the loop } +static bool is_stdin_a_terminal() { +#if defined(_WIN32) + HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdin, &mode); +#else + return isatty(STDIN_FILENO); +#endif +} + +static bool is_stdout_a_terminal() { +#if defined(_WIN32) + HANDLE hStdout = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD mode; + return GetConsoleMode(hStdout, &mode); +#else + return isatty(STDOUT_FILENO); +#endif +} + // Function to tokenize the prompt -static int chat_loop(LlamaData & llama_data, const std::string & user_) { +static int chat_loop(LlamaData & llama_data, const std::string & user) { int prev_len = 0; llama_data.fmtted.resize(llama_n_ctx(llama_data.context.get())); + static const bool stdout_a_terminal = is_stdout_a_terminal(); while (true) { // Get user input std::string user_input; - while (handle_user_input(user_input, user_)) { + while (handle_user_input(user_input, user)) { } - add_message("user", user_.empty() ? user_input : user_, llama_data); + add_message("user", user.empty() ? user_input : user, llama_data); int new_len; if (apply_chat_template_with_error_handling(llama_data, true, new_len) < 0) { return 1; @@ -665,11 +885,11 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) { std::string prompt(llama_data.fmtted.begin() + prev_len, llama_data.fmtted.begin() + new_len); std::string response; - if (generate_response(llama_data, prompt, response)) { + if (generate_response(llama_data, prompt, response, stdout_a_terminal)) { return 1; } - if (!user_.empty()) { + if (!user.empty()) { break; } @@ -682,22 +902,13 @@ static int chat_loop(LlamaData & llama_data, const std::string & user_) { return 0; } -static void log_callback(const enum ggml_log_level level, const char * text, void *) { - if (level == GGML_LOG_LEVEL_ERROR) { +static void log_callback(const enum ggml_log_level level, const char * text, void * p) { + const Opt * opt = static_cast(p); + if (opt->verbose || level == GGML_LOG_LEVEL_ERROR) { printe("%s", text); } } -static bool is_stdin_a_terminal() { -#if defined(_WIN32) - HANDLE hStdin = GetStdHandle(STD_INPUT_HANDLE); - DWORD mode; - return GetConsoleMode(hStdin, &mode); -#else - return isatty(STDIN_FILENO); -#endif -} - static std::string read_pipe_data() { std::ostringstream result; result << std::cin.rdbuf(); // Read all data from std::cin @@ -714,20 +925,20 @@ int main(int argc, const char ** argv) { } if (!is_stdin_a_terminal()) { - if (!opt.user_.empty()) { - opt.user_ += "\n\n"; + if (!opt.user.empty()) { + opt.user += "\n\n"; } - opt.user_ += read_pipe_data(); + opt.user += read_pipe_data(); } - llama_log_set(log_callback, nullptr); + llama_log_set(log_callback, &opt); LlamaData llama_data; if (llama_data.init(opt)) { return 1; } - if (chat_loop(llama_data, opt.user_)) { + if (chat_loop(llama_data, opt.user)) { return 1; } diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index 63fca1d59..1b7cc8c13 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -15,7 +15,7 @@ set(TARGET_SRCS httplib.h ) set(PUBLIC_ASSETS - index.html + index.html.gz loading.html ) @@ -34,6 +34,7 @@ endforeach() add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) +target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) diff --git a/examples/server/README.md b/examples/server/README.md index 8e74808be..99c47e4e9 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -104,7 +104,6 @@ The project is under active development, and we are [looking for feedback and co | `-s, --seed SEED` | RNG seed (default: -1, use random seed for -1) | | `--sampling-seq SEQUENCE` | simplified sequence for samplers that will be used (default: dkypmxt) | | `--ignore-eos` | ignore end of stream token and continue generating (implies --logit-bias EOS-inf) | -| `--penalize-nl` | penalize newline tokens (default: false) | | `--temp N` | temperature (default: 0.8) | | `--top-k N` | top-k sampling (default: 40, 0 = disabled) | | `--top-p N` | top-p sampling (default: 0.9, 1.0 = disabled) | @@ -344,6 +343,10 @@ node index.js ### POST `/completion`: Given a `prompt`, it returns the predicted completion. +> [!IMPORTANT] +> +> This endpoint is **not** OAI-compatible + *Options:* `prompt`: Provide the prompt for this completion as a string or as an array of strings or numbers representing tokens. Internally, if `cache_prompt` is `true`, the prompt is compared to the previous completion and only the "unseen" suffix is evaluated. A `BOS` token is inserted at the start, if all of the following conditions are true: @@ -393,8 +396,6 @@ These words will not be included in the completion, so make sure to add them to `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. -`penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true` - `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. @@ -441,40 +442,76 @@ These words will not be included in the completion, so make sure to add them to `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` +`return_tokens`: Return the raw generated token ids in the `tokens` field. Otherwise `tokens` remains empty. Default: `false` + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values. `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false` +`post_sampling_probs`: Returns the probabilities of top `n_probs` tokens after applying sampling chain. + +`response_fields`: A list of response fields, for example: `"response_fields": ["content", "generation_settings/n_predict"]`. If the specified field is missing, it will simply be omitted from the response without triggering an error. + **Response format** -- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. +- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support. -- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: - -```json -{ - "content": "", - "probs": [ - { - "prob": float, - "tok_str": "" - }, - { - "prob": float, - "tok_str": "" - }, +- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has a nested array `top_logprobs`. It contains at **maximum** `n_probs` elements: + ```json + { + "content": "", + "tokens": [ generated token ids if requested ], ... - ] -}, -``` - -Notice that each `probs` is an array of length `n_probs`. + "probs": [ + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + "top_logprobs": [ + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + }, + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + }, + ... + ] + }, + { + "id": , + "logprob": float, + "token": "", + "bytes": [int, int, ...], + "top_logprobs": [ + ... + ] + }, + ... + ] + }, + ``` + Please note that if `post_sampling_probs` is set to `true`: + - `logprob` will be replaced with `prob`, with the value between 0.0 and 1.0 + - `top_logprobs` will be replaced with `top_probs`. Each element contains: + - `id`: token ID + - `token`: token in string + - `bytes`: token in bytes + - `prob`: token probability, with the value between 0.0 and 1.0 + - Number of elements in `top_probs` may be less than `n_probs` - `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string. +- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request. - `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options) - `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.). -- `model`: The path to the model loaded with `-m` -- `prompt`: The provided `prompt` +- `model`: The model alias (for model path, please use `/props` endpoint) +- `prompt`: The processed `prompt` (special tokens may be added) - `stop_type`: Indicating whether the completion has stopped. Possible values are: - `none`: Generating (not stopped) - `eos`: Stopped because it encountered the EOS token @@ -655,7 +692,6 @@ This endpoint is public (no API key check). By default, it is read-only. To make "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.10000000149011612, - "penalize_nl": false, "stop": [], "max_tokens": -1, "n_keep": 0, @@ -690,7 +726,8 @@ This endpoint is public (no API key check). By default, it is read-only. To make }, "total_slots": 1, "model_path": "../models/Meta-Llama-3.1-8B-Instruct-Q4_K_M.gguf", - "chat_template": "..." + "chat_template": "...", + "build_info": "b(build number)-(build commit hash)" } ``` @@ -842,6 +879,8 @@ curl http://localhost:8080/v1/chat/completions \ ### POST `/v1/embeddings`: OpenAI-compatible embeddings API +This endpoint requires that the model uses a pooling different than type `none`. The embeddings are normalized using the Eucledian norm. + *Options:* See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-reference/embeddings). @@ -874,6 +913,46 @@ See [OpenAI Embeddings API documentation](https://platform.openai.com/docs/api-r }' ``` +### POST `/embeddings`: non-OpenAI-compatible embeddings API + +This endpoint supports all poolings, including `--pooling none`. When the pooling is `none`, the responses will contain the *unnormalized* embeddings for *all* input tokens. For all other pooling types, only the pooled embeddings are returned, normalized using Euclidian norm. + +Note that the response format of this endpoint is different from `/v1/embeddings`. + +*Options:* + +Same as the `/v1/embeddings` endpoint. + +*Examples:* + +Same as the `/v1/embeddings` endpoint. + +**Response format** + +```json +[ + { + "index": 0, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + }, + ... + { + "index": P, + "embedding": [ + [ ... embeddings for token 0 ... ], + [ ... embeddings for token 1 ... ], + [ ... ] + [ ... embeddings for token N-1 ... ], + ] + } +] +``` + ### GET `/slots`: Returns the current slots processing state > [!WARNING] @@ -924,7 +1003,6 @@ Example: "mirostat": 0, "mirostat_tau": 5.0, "mirostat_eta": 0.10000000149011612, - "penalize_nl": false, "stop": [], "max_tokens": -1, "n_keep": 0, diff --git a/examples/server/public/index.html b/examples/server/public/index.html deleted file mode 100644 index 9a19c5e83..000000000 --- a/examples/server/public/index.html +++ /dev/null @@ -1,389 +0,0 @@ - - - - - - - - 🦙 llama.cpp - chat - - - - - -
-
- - - -
- -
-
-

Conversations

- - - -
- - -
- + New conversation -
-
- {{ conv.messages[0].content }} -
-
- Conversations are saved to browser's localStorage -
-
-
- - -
- -
- - - -
llama.cpp
- - -
- - - - - -
-
- - -
-
- - {{ messages.length === 0 ? 'Send a message to start' : '' }} -
-
- -
- - -
- -
-
- - -
- - - -
-
- -
- - - - - - - -
- - - - - - - - - - - - diff --git a/examples/server/public/index.html.gz b/examples/server/public/index.html.gz new file mode 100644 index 000000000..36f9c9fe9 Binary files /dev/null and b/examples/server/public/index.html.gz differ diff --git a/examples/server/public_legacy/index-new.html b/examples/server/public_legacy/index-new.html index 8bfa380e5..cbfbbdf28 100644 --- a/examples/server/public_legacy/index-new.html +++ b/examples/server/public_legacy/index-new.html @@ -39,7 +39,6 @@ temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower repeat_last_n: 0, // 0 = disable penalty, -1 = context size repeat_penalty: 1.0, // 1.0 = disabled - penalize_nl: false, // true only useful for infinite completion dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_base: 1.75, // 0.0 = disabled dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well diff --git a/examples/server/public_legacy/index.html b/examples/server/public_legacy/index.html index a95f5c6df..75f39330a 100644 --- a/examples/server/public_legacy/index.html +++ b/examples/server/public_legacy/index.html @@ -303,7 +303,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well dry_base: 1.75, // 0.0 = disabled dry_allowed_length: 2, // tokens extending repetitions beyond this receive penalty, 2 works well @@ -1006,7 +1005,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 3a18844b6..a13a65594 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -15,7 +15,7 @@ #define MIMETYPE_JSON "application/json; charset=utf-8" // auto generated files (update with ./deps.sh) -#include "index.html.hpp" +#include "index.html.gz.hpp" #include "loading.html.hpp" #include @@ -79,8 +79,9 @@ enum error_type { }; struct slot_params { - bool stream = true; - bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool stream = true; + bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt + bool return_tokens = false; int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half @@ -93,7 +94,9 @@ struct slot_params { json input_prefix; json input_suffix; std::vector antiprompt; + std::vector response_fields; bool timings_per_token = false; + bool post_sampling_probs = false; bool ignore_eos = false; struct common_params_sampling sampling; @@ -139,7 +142,6 @@ struct slot_params { {"mirostat", sampling.mirostat}, {"mirostat_tau", sampling.mirostat_tau}, {"mirostat_eta", sampling.mirostat_eta}, - {"penalize_nl", sampling.penalize_nl}, {"stop", antiprompt}, {"max_tokens", n_predict}, // User configured n_predict {"n_keep", n_keep}, @@ -150,11 +152,13 @@ struct slot_params { {"n_probs", sampling.n_probs}, {"min_keep", sampling.min_keep}, {"grammar", sampling.grammar}, + {"grammar_trigger_words", sampling.grammar_trigger_words}, {"samplers", samplers}, {"speculative.n_max", speculative.n_max}, {"speculative.n_min", speculative.n_min}, {"speculative.p_min", speculative.p_min}, {"timings_per_token", timings_per_token}, + {"post_sampling_probs", post_sampling_probs}, }; } }; @@ -188,6 +192,7 @@ struct server_task { static slot_params params_from_json_cmpl( const llama_model * model, + const llama_context * ctx, const common_params & params_base, const json & data) { slot_params params; @@ -203,12 +208,14 @@ struct server_task { params.stream = json_value(data, "stream", false); params.cache_prompt = json_value(data, "cache_prompt", true); + params.return_tokens = json_value(data, "return_tokens", false); params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict)); params.n_indent = json_value(data, "n_indent", defaults.n_indent); params.n_keep = json_value(data, "n_keep", defaults.n_keep); params.n_discard = json_value(data, "n_discard", defaults.n_discard); //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms); + params.response_fields = json_value(data, "response_fields", std::vector()); params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k); params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p); @@ -230,10 +237,10 @@ struct server_task { params.sampling.mirostat = json_value(data, "mirostat", defaults.sampling.mirostat); params.sampling.mirostat_tau = json_value(data, "mirostat_tau", defaults.sampling.mirostat_tau); params.sampling.mirostat_eta = json_value(data, "mirostat_eta", defaults.sampling.mirostat_eta); - params.sampling.penalize_nl = json_value(data, "penalize_nl", defaults.sampling.penalize_nl); params.sampling.seed = json_value(data, "seed", defaults.sampling.seed); params.sampling.n_probs = json_value(data, "n_probs", defaults.sampling.n_probs); params.sampling.min_keep = json_value(data, "min_keep", defaults.sampling.min_keep); + params.post_sampling_probs = json_value(data, "post_sampling_probs", defaults.post_sampling_probs); params.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min); params.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max); @@ -243,8 +250,27 @@ struct server_task { params.speculative.n_min = std::max(params.speculative.n_min, 2); params.speculative.n_max = std::max(params.speculative.n_max, 0); + // TODO: add more sanity checks for the input parameters + + if (params.sampling.penalty_last_n < -1) { + throw std::runtime_error("Error: repeat_last_n must be >= -1"); + } + + if (params.sampling.dry_penalty_last_n < -1) { + throw std::runtime_error("Error: dry_penalty_last_n must be >= -1"); + } + + if (params.sampling.penalty_last_n == -1) { + // note: should be the slot's context and not the full context, but it's ok + params.sampling.penalty_last_n = llama_n_ctx(ctx); + } + + if (params.sampling.dry_penalty_last_n == -1) { + params.sampling.dry_penalty_last_n = llama_n_ctx(ctx); + } + if (params.sampling.dry_base < 1.0f) { - params.sampling.dry_base = defaults.sampling.dry_base; + params.sampling.dry_base = defaults.sampling.dry_base; } // sequence breakers for DRY @@ -430,41 +456,75 @@ inline std::string stop_type_to_str(stop_type type) { struct completion_token_output { llama_token tok; + float prob; std::string text_to_send; - struct token_prob { + struct prob_info { llama_token tok; - std::string tok_str; + std::string txt; float prob; }; - std::vector probs; + std::vector probs; - json to_json() const { + json to_json(bool post_sampling_probs) const { json probs_for_token = json::array(); for (const auto & p : probs) { + std::string txt(p.txt); + txt.resize(validate_utf8(txt)); probs_for_token.push_back(json { - {"tok_str", p.tok_str}, - {"prob", p.prob}, + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.txt)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, }); } return probs_for_token; } - static json probs_vector_to_json(const std::vector & probs) { + static json probs_vector_to_json(const std::vector & probs, bool post_sampling_probs) { json out = json::array(); - for (const auto & prob : probs) { - const std::string tok_str = prob.text_to_send; + for (const auto & p : probs) { + std::string txt(p.text_to_send); + txt.resize(validate_utf8(txt)); out.push_back(json { - {"content", tok_str}, - {"probs", prob.to_json()}, + {"id", p.tok}, + {"token", txt}, + {"bytes", str_to_bytes(p.text_to_send)}, + { + post_sampling_probs ? "prob" : "logprob", + post_sampling_probs ? p.prob : logarithm(p.prob) + }, + { + post_sampling_probs ? "top_probs" : "top_logprobs", + p.to_json(post_sampling_probs) + }, }); } return out; } + + static float logarithm(float x) { + // nlohmann::json converts -inf to null, so we need to prevent that + return x == 0.0f ? std::numeric_limits::lowest() : std::log(x); + } + + static std::vector str_to_bytes(const std::string & str) { + std::vector bytes; + for (unsigned char c : str) { + bytes.push_back(c); + } + return bytes; + } }; struct server_task_result_cmpl_final : server_task_result { int index = 0; - std::string content; + + std::string content; + llama_tokens tokens; + bool stream; result_timings timings; std::string prompt; @@ -473,11 +533,13 @@ struct server_task_result_cmpl_final : server_task_result { int32_t n_decoded; int32_t n_prompt_tokens; int32_t n_tokens_cached; - int32_t has_new_line; + bool has_new_line; std::string stopping_word; stop_type stop = STOP_TYPE_NONE; + bool post_sampling_probs; std::vector probs_output; + std::vector response_fields; slot_params generation_params; @@ -508,6 +570,7 @@ struct server_task_result_cmpl_final : server_task_result { json res = json { {"index", index}, {"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk + {"tokens", stream ? llama_tokens {} : tokens}, {"id_slot", id_slot}, {"stop", true}, {"model", oaicompat_model}, @@ -522,10 +585,10 @@ struct server_task_result_cmpl_final : server_task_result { {"tokens_cached", n_tokens_cached}, {"timings", timings.to_json()}, }; - if (!probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + if (!stream && !probs_output.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs); } - return res; + return response_fields.empty() ? res : json_get_nested_values(response_fields, res); } json to_json_oaicompat_chat() { @@ -560,22 +623,30 @@ struct server_task_result_cmpl_final : server_task_result { message_content = content; } - json choices = json::array({json{ + json choice { {"finish_reason", finish_reason}, {"index", 0}, - {"message", json{{"content", message_content}, - {"tool_calls", tool_calls}, - {"role", "assistant"} - } - }}}); + {"message", { + {"content", message_content}, + {"tool_calls", tool_calls}, + {"role", "assistant"}, + }}, + }; + + if (!stream && probs_output.size() > 0) { + choice["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json(probs_output, post_sampling_probs)}, + }; + } std::time_t t = std::time(0); - json res = json { - {"choices", choices}, - {"created", t}, - {"model", oaicompat_model}, - {"object", "chat.completion"}, + json res { + {"choices", json::array({choice})}, + {"created", t}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion"}, {"usage", json { {"completion_tokens", n_decoded}, {"prompt_tokens", n_prompt_tokens}, @@ -602,16 +673,19 @@ struct server_task_result_cmpl_final : server_task_result { finish_reason = "stop"; } - json choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); + json choice { + {"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()} + }; - json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"}, + json ret { + {"choices", json::array({choice})}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"}, {"usage", json { {"completion_tokens", n_decoded}, {"prompt_tokens", n_prompt_tokens}, @@ -629,12 +703,15 @@ struct server_task_result_cmpl_final : server_task_result { struct server_task_result_cmpl_partial : server_task_result { int index = 0; - std::string content; + + std::string content; + llama_tokens tokens; int32_t n_decoded; int32_t n_prompt_tokens; - std::vector probs_output; + bool post_sampling_probs; + completion_token_output prob_output; result_timings timings; // OAI-compat fields @@ -663,6 +740,7 @@ struct server_task_result_cmpl_partial : server_task_result { json res = json { {"index", index}, {"content", content}, + {"tokens", tokens}, {"stop", false}, {"id_slot", id_slot}, {"tokens_predicted", n_decoded}, @@ -672,8 +750,8 @@ struct server_task_result_cmpl_partial : server_task_result { if (timings.prompt_n > 0) { res.push_back({"timings", timings.to_json()}); } - if (!probs_output.empty()) { - res["completion_probabilities"] = completion_token_output::probs_vector_to_json(probs_output); + if (!prob_output.probs.empty()) { + res["completion_probabilities"] = completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs); } return res; } @@ -744,7 +822,7 @@ struct server_task_result_cmpl_partial : server_task_result { json second_ret = json{ {"choices", json::array({json{{"finish_reason", nullptr}, {"index", 0}, - {"delta", json{ + {"delta", json { {"content", content}}} }})}, {"created", t}, @@ -759,18 +837,27 @@ struct server_task_result_cmpl_partial : server_task_result { {"finish_reason", nullptr}, {"index", 0}, {"delta", - json{ + json { {"content", content}, }}, }}); } + GGML_ASSERT(choices.size() >= 1); + + if (prob_output.probs.size() > 0) { + choices[0]["logprobs"] = json{ + {"content", completion_token_output::probs_vector_to_json({prob_output}, post_sampling_probs)}, + }; + } + json ret = json { - {"choices", choices}, - {"created", t}, - {"id", oaicompat_cmpl_id}, - {"model", oaicompat_model}, - {"object", "chat.completion.chunk"} + {"choices", choices}, + {"created", t}, + {"id", oaicompat_cmpl_id}, + {"model", oaicompat_model}, + {"system_fingerprint", build_info}, + {"object", "chat.completion.chunk"} }; if (timings.prompt_n >= 0) { @@ -783,32 +870,52 @@ struct server_task_result_cmpl_partial : server_task_result { struct server_task_result_embd : server_task_result { int index = 0; - std::vector embedding; + std::vector> embedding; + + int32_t n_tokens; + + // OAI-compat fields + bool oaicompat = false; virtual int get_index() override { return index; } virtual json to_json() override { + return oaicompat ? to_json_oaicompat() : to_json_non_oaicompat(); + } + + json to_json_non_oaicompat() { return json { {"index", index}, {"embedding", embedding}, }; } + + json to_json_oaicompat() { + return json { + {"index", index}, + {"embedding", embedding[0]}, + {"tokens_evaluated", n_tokens}, + }; + } }; struct server_task_result_rerank : server_task_result { int index = 0; float score = -1e6; + int32_t n_tokens; + virtual int get_index() override { return index; } virtual json to_json() override { return json { - {"index", index}, - {"score", score}, + {"index", index}, + {"score", score}, + {"tokens_evaluated", n_tokens}, }; } }; @@ -1015,8 +1122,11 @@ struct server_slot { size_t last_nl_pos = 0; - std::string generated_text; + std::string generated_text; + llama_tokens generated_tokens; + llama_tokens cache_tokens; + std::vector generated_token_probs; bool has_next_token = true; @@ -1037,7 +1147,6 @@ struct server_slot { // stats size_t n_sent_text = 0; // number of sent text character - size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_generation; @@ -1059,9 +1168,9 @@ struct server_slot { stopping_word = ""; n_past = 0; n_sent_text = 0; - n_sent_token_probs = 0; task_type = SERVER_TASK_TYPE_COMPLETION; + generated_tokens.clear(); generated_token_probs.clear(); } @@ -1526,7 +1635,7 @@ struct server_context { n_ctx = llama_n_ctx(ctx); add_bos_token = llama_add_bos_token(model); - has_eos_token = !llama_add_eos_token(model); + has_eos_token = llama_token_eos(model) != LLAMA_TOKEN_NULL; if (!params_base.speculative.model.empty()) { SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); @@ -1791,7 +1900,8 @@ struct server_context { auto match = slot.antiprompts.findSingleTokenMatch(result.tok); // remember which tokens were sampled - used for repetition penalties during sampling - const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger)); + const std::string token_str = result.text_to_send; + // const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special || (match.pos != std::string::npos && match.is_grammar_trigger)); slot.sampled = result.tok; if (match.pos != std::string::npos && !match.is_partial) { @@ -1807,30 +1917,15 @@ struct server_context { // search stop word and delete it slot.generated_text += token_str; + if (slot.params.return_tokens) { + slot.generated_tokens.push_back(result.tok); + } slot.has_next_token = true; // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... - incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } + bool incomplete = validate_utf8(slot.generated_text) < slot.generated_text.size(); + // search stop word and delete it if (!incomplete) { size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); @@ -1969,58 +2064,53 @@ struct server_context { return slot.has_next_token; // continue } - json get_formated_generation(const server_slot & slot) const { - std::vector samplers; - samplers.reserve(slot.params.sampling.samplers.size()); - for (const auto & sampler : slot.params.sampling.samplers) { - samplers.emplace_back(common_sampler_type_to_str(sampler)); - } + void populate_token_probs(const server_slot & slot, completion_token_output & result, bool post_sampling, bool special, int idx) { + size_t n_probs = slot.params.sampling.n_probs; + size_t n_vocab = llama_n_vocab(llama_get_model(ctx)); + if (post_sampling) { + const auto * cur_p = common_sampler_get_candidates(slot.smpl); + const size_t max_probs = cur_p->size; - return json { - {"n_ctx", slot.n_ctx}, - {"n_predict", slot.n_predict}, // Server configured n_predict - {"model", params_base.model_alias}, - {"seed", slot.params.sampling.seed}, - {"seed_cur", slot.smpl ? common_sampler_get_seed(slot.smpl) : 0}, - {"temperature", slot.params.sampling.temp}, - {"dynatemp_range", slot.params.sampling.dynatemp_range}, - {"dynatemp_exponent", slot.params.sampling.dynatemp_exponent}, - {"top_k", slot.params.sampling.top_k}, - {"top_p", slot.params.sampling.top_p}, - {"min_p", slot.params.sampling.min_p}, - {"xtc_probability", slot.params.sampling.xtc_probability}, - {"xtc_threshold", slot.params.sampling.xtc_threshold}, - {"typical_p", slot.params.sampling.typ_p}, - {"repeat_last_n", slot.params.sampling.penalty_last_n}, - {"repeat_penalty", slot.params.sampling.penalty_repeat}, - {"presence_penalty", slot.params.sampling.penalty_present}, - {"frequency_penalty", slot.params.sampling.penalty_freq}, - {"dry_multiplier", slot.params.sampling.dry_multiplier}, - {"dry_base", slot.params.sampling.dry_base}, - {"dry_allowed_length", slot.params.sampling.dry_allowed_length}, - {"dry_penalty_last_n", slot.params.sampling.dry_penalty_last_n}, - {"dry_sequence_breakers", slot.params.sampling.dry_sequence_breakers}, - {"mirostat", slot.params.sampling.mirostat}, - {"mirostat_tau", slot.params.sampling.mirostat_tau}, - {"mirostat_eta", slot.params.sampling.mirostat_eta}, - {"penalize_nl", slot.params.sampling.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"grammar_trigger_words", slot.params.sampling.grammar_trigger_words}, - {"max_tokens", slot.params.n_predict}, // User configured n_predict - {"n_keep", slot.params.n_keep}, - {"n_discard", slot.params.n_discard}, - {"ignore_eos", slot.params.sampling.ignore_eos}, - {"stream", slot.params.stream}, - //{"logit_bias", slot.params.sampling.logit_bias}, - {"n_probs", slot.params.sampling.n_probs}, - {"min_keep", slot.params.sampling.min_keep}, - {"grammar", slot.params.sampling.grammar}, - {"samplers", samplers}, - {"speculative", slot.can_speculate()}, - {"speculative.n_max", slot.params.speculative.n_max}, - {"speculative.n_min", slot.params.speculative.n_min}, - {"speculative.p_min", slot.params.speculative.p_min}, - }; + // set probability for sampled token + for (size_t i = 0; i < max_probs; i++) { + if (cur_p->data[i].id == result.tok) { + result.prob = cur_p->data[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(max_probs); + for (size_t i = 0; i < std::min(max_probs, n_probs); i++) { + result.probs.push_back({ + cur_p->data[i].id, + common_detokenize(ctx, {cur_p->data[i].id}, special), + cur_p->data[i].p + }); + } + } else { + // TODO: optimize this with min-p optimization + std::vector cur = get_token_probabilities(ctx, idx); + + // set probability for sampled token + for (size_t i = 0; i < n_vocab; i++) { + // set probability for sampled token + if (cur[i].id == result.tok) { + result.prob = cur[i].p; + break; + } + } + + // set probability for top n_probs tokens + result.probs.reserve(n_probs); + for (size_t i = 0; i < std::min(n_vocab, n_probs); i++) { + result.probs.push_back({ + cur[i].id, + common_detokenize(ctx, {cur[i].id}, special), + cur[i].p + }); + } + } } void send_error(const server_task & task, const std::string & error, const enum error_type type = ERROR_TYPE_SERVER) { @@ -2048,9 +2138,11 @@ struct server_context { res->id = slot.id_task; res->index = slot.index; res->content = tkn.text_to_send; + res->tokens = { tkn.tok }; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->post_sampling_probs = slot.params.post_sampling_probs; res->verbose = slot.params.verbose; res->oaicompat = slot.params.oaicompat; @@ -2062,17 +2154,7 @@ struct server_context { // populate res.probs_output if (slot.params.sampling.n_probs > 0) { - const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false); - - const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); - const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); - - std::vector probs_output; - if (probs_pos < probs_stop_pos) { - res->probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } + res->prob_output = tkn; // copy the token probs } // populate timings if this is final response or timings_per_token is enabled @@ -2090,16 +2172,19 @@ struct server_context { res->index = slot.index; res->content = slot.generated_text; + res->tokens = slot.generated_tokens; res->timings = slot.get_timings(); res->prompt = common_detokenize(ctx, slot.prompt_tokens, true); + res->response_fields = slot.params.response_fields; - res->truncated = slot.truncated; - res->n_decoded = slot.n_decoded; - res->n_prompt_tokens = slot.n_prompt_tokens; - res->n_tokens_cached = slot.n_past; - res->has_new_line = slot.has_new_line; - res->stopping_word = slot.stopping_word; - res->stop = slot.stop; + res->truncated = slot.truncated; + res->n_decoded = slot.n_decoded; + res->n_prompt_tokens = slot.n_prompt_tokens; + res->n_tokens_cached = slot.n_past; + res->has_new_line = slot.has_new_line; + res->stopping_word = slot.stopping_word; + res->stop = slot.stop; + res->post_sampling_probs = slot.params.post_sampling_probs; res->verbose = slot.params.verbose; res->stream = slot.params.stream; @@ -2133,8 +2218,10 @@ struct server_context { void send_embedding(const server_slot & slot, const llama_batch & batch) { auto res = std::make_unique(); - res->id = slot.id_task; - res->index = slot.index; + res->id = slot.id_task; + res->index = slot.index; + res->n_tokens = slot.n_prompt_tokens; + res->oaicompat = slot.params.oaicompat; const int n_embd = llama_n_embd(model); @@ -2153,12 +2240,18 @@ struct server_context { if (embd == NULL) { SLT_ERR(slot, "failed to get embeddings, token = %d, seq_id = %d\n", batch.token[i], batch.seq_id[i][0]); - res->embedding = std::vector(n_embd, 0.0f); + res->embedding.push_back(std::vector(n_embd, 0.0f)); continue; } - common_embd_normalize(embd, embd_res.data(), n_embd); - res->embedding = embd_res; + // normalize only when there is pooling + // TODO: configurable + if (llama_pooling_type(slot.ctx) != LLAMA_POOLING_TYPE_NONE) { + common_embd_normalize(embd, embd_res.data(), n_embd, 2); + res->embedding.push_back(embd_res); + } else { + res->embedding.push_back({ embd, embd + n_embd }); + } } SLT_DBG(slot, "%s", "sending embeddings\n"); @@ -2170,6 +2263,7 @@ struct server_context { auto res = std::make_unique(); res->id = slot.id_task; res->index = slot.index; + res->n_tokens = slot.n_prompt_tokens; for (int i = 0; i < batch.n_tokens; ++i) { if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { @@ -2771,7 +2865,10 @@ struct server_context { // add prompt tokens for processing in the current batch while (slot.n_past < slot.n_prompt_tokens && batch.n_tokens < n_batch) { - common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, false); + // without pooling, we want to output the embeddings for all the tokens in the batch + const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE; + + common_batch_add(batch, prompt_tokens[slot.n_past], slot.n_past, { slot.id }, need_embd); if (slot.params.cache_prompt) { slot.cache_tokens.push_back(prompt_tokens[slot.n_past]); @@ -2886,7 +2983,9 @@ struct server_context { continue; // continue loop of slots } - llama_token id = common_sampler_sample(slot.smpl, ctx, slot.i_batch - i); + const int tok_idx = slot.i_batch - i; + + llama_token id = common_sampler_sample(slot.smpl, ctx, tok_idx); slot.i_batch = -1; @@ -2905,17 +3004,12 @@ struct server_context { slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; completion_token_output result; - result.tok = id; + result.tok = id; + result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special); + result.prob = 1.0f; // TODO: set it here instead of doing inside populate_token_probs - const auto * cur_p = common_sampler_get_candidates(slot.smpl); - - for (size_t i = 0; i < (size_t) slot.params.sampling.n_probs; ++i) { - auto tok_id = cur_p->data[i].id; - result.probs.push_back({ - tok_id, - tokens_to_output_formatted_string(ctx, tok_id), - i >= cur_p->size ? 0.0f : cur_p->data[i].p, - }); + if (slot.params.sampling.n_probs > 0) { + populate_token_probs(slot, result, slot.params.post_sampling_probs, params_base.special, tok_idx); } if (!process_token(result, slot)) { @@ -2999,7 +3093,11 @@ struct server_context { for (size_t i = 0; i < ids.size(); ++i) { completion_token_output result; - result.tok = ids[i]; + result.tok = ids[i]; + result.text_to_send = common_token_to_piece(ctx, result.tok, params_base.special); + result.prob = 1.0f; // set later + + // TODO: set result.probs if (!process_token(result, slot)) { // release slot because of stop condition @@ -3495,6 +3593,7 @@ int main(int argc, char ** argv) { { "bos_token", common_token_to_piece(ctx_server.ctx, llama_token_bos(ctx_server.model), true) }, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_token_eos(ctx_server.model), true) }, { "chat_template", chat_template.source()}, + { "build_info", build_info }, }; if (ctx_server.params_base.use_jinja) { auto tool_use_chat_template = llama_chat_template_from_model(ctx_server.model, ctx_server.params_base.chat_template, /* prefer_tool_use= */ true); @@ -3548,7 +3647,7 @@ int main(int argc, char ** argv) { task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); - task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.params_base, data); + task.params = server_task::params_from_json_cmpl(ctx_server.model, ctx_server.ctx, ctx_server.params_base, data); task.id_selected_slot = json_value(data, "id_slot", -1); // OAI-compat @@ -3732,7 +3831,7 @@ int main(int argc, char ** argv) { {"object", "list"}, {"data", { { - {"id", params.model_alias}, + {"id", params.model_alias.empty() ? params.model : params.model_alias}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, @@ -3797,34 +3896,61 @@ int main(int argc, char ** argv) { res_ok(res, data); }; - const auto handle_embeddings = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { + const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, bool oaicompat) { const json body = json::parse(req.body); - bool oaicompat = false; - // an input prompt can be a string or a list of tokens (integer) + if (oaicompat && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) { + res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST)); + return; + } + + // for the shape of input/content, see tokenize_input_prompts() json prompt; if (body.count("input") != 0) { - oaicompat = true; prompt = body.at("input"); - } else if (body.count("content") != 0) { - // with "content", we only support single prompt - prompt = std::vector{body.at("content")}; + } else if (body.contains("content")) { + oaicompat = false; + prompt = body.at("content"); } else { res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST)); return; } + bool use_base64 = false; + if (body.count("encoding_format") != 0) { + const std::string& format = body.at("encoding_format"); + if (format == "base64") { + use_base64 = true; + } else if (format != "float") { + res_error(res, format_error_response("The format to return the embeddings in. Can be either float or base64", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, true, true); + for (const auto & tokens : tokenized_prompts) { + // this check is necessary for models that do not add BOS token to the input + if (tokens.empty()) { + res_error(res, format_error_response("Input content cannot be empty", ERROR_TYPE_INVALID_REQUEST)); + return; + } + } + // create and queue the task json responses = json::array(); bool error = false; { std::vector tasks; - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.ctx, prompt, /* add_special */ false, true); for (size_t i = 0; i < tokenized_prompts.size(); i++) { - server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + server_task task = server_task(SERVER_TASK_TYPE_EMBEDDING); + task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; task.prompt_tokens = std::move(tokenized_prompts[i]); + + // OAI-compat + task.params.oaicompat = oaicompat; + tasks.push_back(task); } @@ -3852,12 +3978,18 @@ int main(int argc, char ** argv) { } // write JSON response - json root = oaicompat - ? format_embeddings_response_oaicompat(body, responses) - : responses.size() == 1 ? responses[0] : json(responses); + json root = oaicompat ? format_embeddings_response_oaicompat(body, responses, use_base64) : json(responses); res_ok(res, root); }; + const auto handle_embeddings = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, false); + }; + + const auto handle_embeddings_oai = [&handle_embeddings_impl](const httplib::Request & req, httplib::Response & res) { + handle_embeddings_impl(req, res, true); + }; + const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) { if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) { res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED)); @@ -4004,8 +4136,13 @@ int main(int argc, char ** argv) { } } else { // using embedded static index.html - svr->Get("/", [](const httplib::Request &, httplib::Response & res) { - res.set_content(reinterpret_cast(index_html), index_html_len, "text/html; charset=utf-8"); + svr->Get("/", [](const httplib::Request & req, httplib::Response & res) { + if (req.get_header_value("Accept-Encoding").find("gzip") == std::string::npos) { + res.set_content("Error: gzip is not supported by this browser", "text/plain"); + } else { + res.set_header("Content-Encoding", "gzip"); + res.set_content(reinterpret_cast(index_html_gz), index_html_gz_len, "text/html; charset=utf-8"); + } return false; }); } @@ -4026,7 +4163,7 @@ int main(int argc, char ** argv) { svr->Post("/infill", handle_infill); svr->Post("/embedding", handle_embeddings); // legacy svr->Post("/embeddings", handle_embeddings); - svr->Post("/v1/embeddings", handle_embeddings); + svr->Post("/v1/embeddings", handle_embeddings_oai); svr->Post("/rerank", handle_rerank); svr->Post("/reranking", handle_rerank); svr->Post("/v1/rerank", handle_rerank); diff --git a/examples/server/tests/unit/test_chat_completion.py b/examples/server/tests/unit/test_chat_completion.py index 1da9f8c4b..154176d32 100644 --- a/examples/server/tests/unit/test_chat_completion.py +++ b/examples/server/tests/unit/test_chat_completion.py @@ -31,6 +31,7 @@ def test_chat_completion(model, system_prompt, user_prompt, max_tokens, re_conte }) assert res.status_code == 200 assert "cmpl" in res.body["id"] # make sure the completion id has the expected format + assert res.body["system_fingerprint"].startswith("b") assert res.body["model"] == model if model is not None else server.model_alias assert res.body["usage"]["prompt_tokens"] == n_prompt assert res.body["usage"]["completion_tokens"] == n_predicted @@ -63,6 +64,7 @@ def test_chat_completion_stream(system_prompt, user_prompt, max_tokens, re_conte last_cmpl_id = None for data in res: choice = data["choices"][0] + assert data["system_fingerprint"].startswith("b") assert "gpt-3.5" in data["model"] # DEFAULT_OAICOMPAT_MODEL, maybe changed in the future if last_cmpl_id is None: last_cmpl_id = data["id"] @@ -92,7 +94,7 @@ def test_chat_completion_with_openai_library(): seed=42, temperature=0.8, ) - print(res) + assert res.system_fingerprint is not None and res.system_fingerprint.startswith("b") assert res.choices[0].finish_reason == "length" assert res.choices[0].message.content is not None assert match_regex("(Suddenly)+", res.choices[0].message.content) @@ -323,3 +325,64 @@ def test_hello_world_tool_call(tool: dict, expected_arguments: dict, hf_repo: st assert tool["function"]["name"] == tool_call["function"]["name"] actual_arguments = json.loads(tool_call["function"]["arguments"]) assert json.dumps(expected_arguments) == json.dumps(actual_arguments), f"tool arguments: {json.dumps(actual_arguments)}, expected: {json.dumps(expected_arguments)}" + + +def test_logprobs(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=5, + logprobs=True, + top_logprobs=10, + ) + output_text = res.choices[0].message.content + aggregated_text = '' + assert res.choices[0].logprobs is not None + assert res.choices[0].logprobs.content is not None + for token in res.choices[0].logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert len(token.top_logprobs) > 0 + assert aggregated_text == output_text + + +def test_logprobs_stream(): + global server + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + res = client.chat.completions.create( + model="gpt-3.5-turbo-instruct", + temperature=0.0, + messages=[ + {"role": "system", "content": "Book"}, + {"role": "user", "content": "What is the best book"}, + ], + max_tokens=5, + logprobs=True, + top_logprobs=10, + stream=True, + ) + output_text = '' + aggregated_text = '' + for data in res: + choice = data.choices[0] + if choice.finish_reason is None: + if choice.delta.content: + output_text += choice.delta.content + assert choice.logprobs is not None + assert choice.logprobs.content is not None + for token in choice.logprobs.content: + aggregated_text += token.token + assert token.logprob <= 0.0 + assert token.bytes is not None + assert token.top_logprobs is not None + assert len(token.top_logprobs) > 0 + assert aggregated_text == output_text diff --git a/examples/server/tests/unit/test_completion.py b/examples/server/tests/unit/test_completion.py index 7f4f9cd03..a6b215944 100644 --- a/examples/server/tests/unit/test_completion.py +++ b/examples/server/tests/unit/test_completion.py @@ -10,22 +10,29 @@ def create_server(): global server server = ServerPreset.tinyllama2() -@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ - ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False), - ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False), +@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [ + ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False), + ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True), ]) -def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool): +def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool): global server server.start() res = server.make_request("POST", "/completion", data={ "n_predict": n_predict, "prompt": prompt, + "return_tokens": return_tokens, }) assert res.status_code == 200 assert res.body["timings"]["prompt_n"] == n_prompt assert res.body["timings"]["predicted_n"] == n_predicted assert res.body["truncated"] == truncated + assert type(res.body["has_new_line"]) == bool assert match_regex(re_content, res.body["content"]) + if return_tokens: + assert len(res.body["tokens"]) > 0 + assert all(type(tok) == int for tok in res.body["tokens"]) + else: + assert res.body["tokens"] == [] @pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [ @@ -48,12 +55,15 @@ def test_completion_stream(prompt: str, n_predict: int, re_content: str, n_promp assert data["timings"]["predicted_n"] == n_predicted assert data["truncated"] == truncated assert data["stop_type"] == "limit" + assert type(data["has_new_line"]) == bool assert "generation_settings" in data assert server.n_predict is not None assert data["generation_settings"]["n_predict"] == min(n_predict, server.n_predict) assert data["generation_settings"]["seed"] == server.seed assert match_regex(re_content, content) else: + assert len(data["tokens"]) > 0 + assert all(type(tok) == int for tok in data["tokens"]) content += data["content"] @@ -85,7 +95,7 @@ def test_consistent_result_same_seed(n_slots: int): res = server.make_request("POST", "/completion", data={ "prompt": "I believe the meaning of life is", "seed": 42, - "temperature": 1.0, + "temperature": 0.0, "cache_prompt": False, # TODO: remove this once test_cache_vs_nocache_prompt is fixed }) if last_res is not None: @@ -110,9 +120,10 @@ def test_different_result_different_seed(n_slots: int): assert res.body["content"] != last_res.body["content"] last_res = res - +# TODO figure why it don't work with temperature = 1 +# @pytest.mark.parametrize("temperature", [0.0, 1.0]) @pytest.mark.parametrize("n_batch", [16, 32]) -@pytest.mark.parametrize("temperature", [0.0, 1.0]) +@pytest.mark.parametrize("temperature", [0.0]) def test_consistent_result_different_batch_size(n_batch: int, temperature: float): global server server.n_batch = n_batch @@ -247,6 +258,40 @@ def test_completion_parallel_slots(n_slots: int, n_requests: int): # assert match_regex(re_content, res.body["content"]) +@pytest.mark.parametrize( + "prompt,n_predict,response_fields", + [ + ("I believe the meaning of life is", 8, []), + ("I believe the meaning of life is", 32, ["content", "generation_settings/n_predict", "prompt"]), + ], +) +def test_completion_response_fields( + prompt: str, n_predict: int, response_fields: list[str] +): + global server + server.start() + res = server.make_request( + "POST", + "/completion", + data={ + "n_predict": n_predict, + "prompt": prompt, + "response_fields": response_fields, + }, + ) + assert res.status_code == 200 + assert "content" in res.body + assert len(res.body["content"]) + if len(response_fields): + assert res.body["generation_settings/n_predict"] == n_predict + assert res.body["prompt"] == " " + prompt + assert isinstance(res.body["content"], str) + assert len(res.body) == len(response_fields) + else: + assert len(res.body) + assert "generation_settings" in res.body + + def test_n_probs(): global server server.start() @@ -260,9 +305,68 @@ def test_n_probs(): assert "completion_probabilities" in res.body assert len(res.body["completion_probabilities"]) == 5 for tok in res.body["completion_probabilities"]: - assert "probs" in tok - assert len(tok["probs"]) == 10 - for prob in tok["probs"]: - assert "prob" in prob - assert "tok_str" in prob - assert 0.0 <= prob["prob"] <= 1.0 + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "logprob" in tok and tok["logprob"] <= 0.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_logprobs"]) == 10 + for prob in tok["top_logprobs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "logprob" in prob and prob["logprob"] <= 0.0 + assert "bytes" in prob and type(prob["bytes"]) == list + + +def test_n_probs_stream(): + global server + server.start() + res = server.make_stream_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + "stream": True, + }) + for data in res: + if data["stop"] == False: + assert "completion_probabilities" in data + assert len(data["completion_probabilities"]) == 1 + for tok in data["completion_probabilities"]: + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "logprob" in tok and tok["logprob"] <= 0.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_logprobs"]) == 10 + for prob in tok["top_logprobs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "logprob" in prob and prob["logprob"] <= 0.0 + assert "bytes" in prob and type(prob["bytes"]) == list + + +def test_n_probs_post_sampling(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": "I believe the meaning of life is", + "n_probs": 10, + "temperature": 0.0, + "n_predict": 5, + "post_sampling_probs": True, + }) + assert res.status_code == 200 + assert "completion_probabilities" in res.body + assert len(res.body["completion_probabilities"]) == 5 + for tok in res.body["completion_probabilities"]: + assert "id" in tok and tok["id"] > 0 + assert "token" in tok and type(tok["token"]) == str + assert "prob" in tok and 0.0 < tok["prob"] <= 1.0 + assert "bytes" in tok and type(tok["bytes"]) == list + assert len(tok["top_probs"]) == 10 + for prob in tok["top_probs"]: + assert "id" in prob and prob["id"] > 0 + assert "token" in prob and type(prob["token"]) == str + assert "prob" in prob and 0.0 <= prob["prob"] <= 1.0 + assert "bytes" in prob and type(prob["bytes"]) == list + # because the test model usually output token with either 100% or 0% probability, we need to check all the top_probs + assert any(prob["prob"] == 1.0 for prob in tok["top_probs"]) diff --git a/examples/server/tests/unit/test_embedding.py b/examples/server/tests/unit/test_embedding.py index fc7c20064..8b0eb42b0 100644 --- a/examples/server/tests/unit/test_embedding.py +++ b/examples/server/tests/unit/test_embedding.py @@ -1,3 +1,5 @@ +import base64 +import struct import pytest from openai import OpenAI from utils import * @@ -14,8 +16,9 @@ def create_server(): def test_embedding_single(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "I believe the meaning of life is", }) assert res.status_code == 200 @@ -29,8 +32,9 @@ def test_embedding_single(): def test_embedding_multiple(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -45,10 +49,72 @@ def test_embedding_multiple(): assert len(d['embedding']) > 1 -def test_embedding_openai_library_single(): +@pytest.mark.parametrize( + "input,is_multi_prompt", + [ + # do not crash on empty input + ("", False), + # single prompt + ("string", False), + ([12, 34, 56], False), + ([12, 34, "string", 56, 78], False), + # multiple prompts + (["string1", "string2"], True), + (["string1", [12, 34, 56]], True), + ([[12, 34, 56], [12, 34, 56]], True), + ([[12, 34, 56], [12, "string", 34, 56]], True), + ] +) +def test_embedding_mixed_input(input, is_multi_prompt: bool): global server server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + res = server.make_request("POST", "/v1/embeddings", data={"input": input}) + assert res.status_code == 200 + data = res.body['data'] + if is_multi_prompt: + assert len(data) == len(input) + for d in data: + assert 'embedding' in d + assert len(d['embedding']) > 1 + else: + assert 'embedding' in data[0] + assert len(data[0]['embedding']) > 1 + + +def test_embedding_pooling_none(): + global server + server.pooling = 'none' + server.start() + res = server.make_request("POST", "/embeddings", data={ + "input": "hello hello hello", + }) + assert res.status_code == 200 + assert 'embedding' in res.body[0] + assert len(res.body[0]['embedding']) == 5 # 3 text tokens + 2 special + + # make sure embedding vector is not normalized + for x in res.body[0]['embedding']: + assert abs(sum([x ** 2 for x in x]) - 1) > EPSILON + + +def test_embedding_pooling_none_oai(): + global server + server.pooling = 'none' + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": "hello hello hello", + }) + + # /v1/embeddings does not support pooling type 'none' + assert res.status_code == 400 + assert "error" in res.body + + +def test_embedding_openai_library_single(): + global server + server.pooling = 'last' + server.start() + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input="I believe the meaning of life is") assert len(res.data) == 1 assert len(res.data[0].embedding) > 1 @@ -56,8 +122,9 @@ def test_embedding_openai_library_single(): def test_embedding_openai_library_multiple(): global server + server.pooling = 'last' server.start() - client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}") + client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1") res = client.embeddings.create(model="text-embedding-3-small", input=[ "I believe the meaning of life is", "Write a joke about AI from a very long prompt which will not be truncated", @@ -71,8 +138,9 @@ def test_embedding_openai_library_multiple(): def test_embedding_error_prompt_too_long(): global server + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": "This is a test " * 512, }) assert res.status_code != 200 @@ -80,8 +148,9 @@ def test_embedding_error_prompt_too_long(): def test_same_prompt_give_same_result(): + server.pooling = 'last' server.start() - res = server.make_request("POST", "/embeddings", data={ + res = server.make_request("POST", "/v1/embeddings", data={ "input": [ "I believe the meaning of life is", "I believe the meaning of life is", @@ -97,3 +166,72 @@ def test_same_prompt_give_same_result(): vi = res.body['data'][i]['embedding'] for x, y in zip(v0, vi): assert abs(x - y) < EPSILON + + +@pytest.mark.parametrize( + "content,n_tokens", + [ + ("I believe the meaning of life is", 9), + ("This is a test", 6), + ] +) +def test_embedding_usage_single(content, n_tokens): + global server + server.start() + res = server.make_request("POST", "/v1/embeddings", data={"input": content}) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == n_tokens + + +def test_embedding_usage_multiple(): + global server + server.start() + res = server.make_request("POST", "/v1/embeddings", data={ + "input": [ + "I believe the meaning of life is", + "I believe the meaning of life is", + ], + }) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == 2 * 9 + + +def test_embedding_openai_library_base64(): + server.start() + test_input = "Test base64 embedding output" + + # get embedding in default format + res = server.make_request("POST", "/v1/embeddings", data={ + "input": test_input + }) + assert res.status_code == 200 + vec0 = res.body["data"][0]["embedding"] + + # get embedding in base64 format + res = server.make_request("POST", "/v1/embeddings", data={ + "input": test_input, + "encoding_format": "base64" + }) + + assert res.status_code == 200 + assert "data" in res.body + assert len(res.body["data"]) == 1 + + embedding_data = res.body["data"][0] + assert "embedding" in embedding_data + assert isinstance(embedding_data["embedding"], str) + + # Verify embedding is valid base64 + decoded = base64.b64decode(embedding_data["embedding"]) + # Verify decoded data can be converted back to float array + float_count = len(decoded) // 4 # 4 bytes per float + floats = struct.unpack(f'{float_count}f', decoded) + assert len(floats) > 0 + assert all(isinstance(x, float) for x in floats) + assert len(floats) == len(vec0) + + # make sure the decoded data is the same as the original + for x, y in zip(floats, vec0): + assert abs(x - y) < EPSILON diff --git a/examples/server/tests/unit/test_rerank.py b/examples/server/tests/unit/test_rerank.py index 189bc4c96..7203d7943 100644 --- a/examples/server/tests/unit/test_rerank.py +++ b/examples/server/tests/unit/test_rerank.py @@ -53,3 +53,26 @@ def test_invalid_rerank_req(documents): }) assert res.status_code == 400 assert "error" in res.body + + +@pytest.mark.parametrize( + "query,doc1,doc2,n_tokens", + [ + ("Machine learning is", "A machine", "Learning is", 19), + ("Which city?", "Machine learning is ", "Paris, capitale de la", 26), + ] +) +def test_rerank_usage(query, doc1, doc2, n_tokens): + global server + server.start() + + res = server.make_request("POST", "/rerank", data={ + "query": query, + "documents": [ + doc1, + doc2, + ] + }) + assert res.status_code == 200 + assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens'] + assert res.body['usage']['prompt_tokens'] == n_tokens diff --git a/examples/server/tests/utils.py b/examples/server/tests/utils.py index 54a3138d6..7f508e6d8 100644 --- a/examples/server/tests/utils.py +++ b/examples/server/tests/utils.py @@ -65,6 +65,7 @@ class ServerProcess: server_reranking: bool | None = False server_metrics: bool | None = False server_slots: bool | None = False + pooling: str | None = None draft: int | None = None api_key: str | None = None response_format: str | None = None @@ -134,6 +135,8 @@ class ServerProcess: server_args.append("--metrics") if self.server_slots: server_args.append("--slots") + if self.pooling: + server_args.extend(["--pooling", self.pooling]) if self.model_alias: server_args.extend(["--alias", self.model_alias]) if self.n_ctx: diff --git a/examples/server/themes/buttons-top/index.html b/examples/server/themes/buttons-top/index.html index 2797c37c9..3fb88fcc8 100644 --- a/examples/server/themes/buttons-top/index.html +++ b/examples/server/themes/buttons-top/index.html @@ -222,7 +222,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled @@ -779,7 +778,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html index dbe23c402..73f36d4b2 100644 --- a/examples/server/themes/wild/index.html +++ b/examples/server/themes/wild/index.html @@ -225,7 +225,6 @@ temperature: 0.7, repeat_last_n: 256, // 0 = disable penalty, -1 = context size repeat_penalty: 1.18, // 1.0 = disabled - penalize_nl: false, top_k: 40, // <= 0 to use vocab size top_p: 0.95, // 1.0 = disabled min_p: 0.05, // 0 = disabled @@ -782,7 +781,6 @@ ${FloatField({ label: "Temperature", max: 2.0, min: 0.0, name: "temperature", step: 0.01, value: params.value.temperature })} ${FloatField({ label: "Penalize repeat sequence", max: 2.0, min: 0.0, name: "repeat_penalty", step: 0.01, value: params.value.repeat_penalty })} ${IntField({ label: "Consider N tokens for penalize", max: 2048, min: 0, name: "repeat_last_n", value: params.value.repeat_last_n })} - ${BoolField({ label: "Penalize repetition of newlines", name: "penalize_nl", value: params.value.penalize_nl })} ${IntField({ label: "Top-K sampling", max: 100, min: -1, name: "top_k", value: params.value.top_k })} ${FloatField({ label: "Top-P sampling", max: 1.0, min: 0.0, name: "top_p", step: 0.01, value: params.value.top_p })} ${FloatField({ label: "Min-P sampling", max: 1.0, min: 0.0, name: "min_p", step: 0.01, value: params.value.min_p })} diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index e5ae16a70..260ac07cb 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,6 +3,7 @@ #include "common.h" #include "log.h" #include "llama.h" +#include "common/base64.hpp" #ifndef NDEBUG // crash the server in debug mode, otherwise send an http 500 error @@ -24,7 +25,7 @@ #include #include -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" +#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo" using json = nlohmann::ordered_json; @@ -58,6 +59,8 @@ static T json_value(const json & body, const std::string & key, const T & defaul } } +const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); + // // tokenizer and input processing utils // @@ -90,6 +93,28 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) { return false; } +// get value by path(key1 / key2) +static json json_get_nested_values(const std::vector & paths, const json & js) { + json result = json::object(); + + for (const std::string & path : paths) { + json current = js; + const auto keys = string_split(path, /*separator*/ '/'); + bool valid_path = true; + for (const std::string & k : keys) { + if (valid_path && current.is_object() && current.contains(k)) { + current = current[k]; + } else { + valid_path = false; + } + } + if (valid_path) { + result[path] = current; + } + } + return result; +} + /** * this handles 2 cases: * - only string, example: "string" @@ -140,6 +165,7 @@ static llama_tokens tokenize_mixed(const llama_context * ctx, const json & json_ * and multiple prompts (multi-tasks): * - "prompt": ["string1", "string2"] * - "prompt": ["string1", [12, 34, 56]] + * - "prompt": [[12, 34, 56], [78, 90, 12]] * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] */ static std::vector tokenize_input_prompts(llama_context * ctx, const json & json_prompt, bool add_special, bool parse_special) { @@ -172,6 +198,36 @@ static std::vector tokenize_input_prompts(llama_context * ctx, con return result; } +// return the last index of character that can form a valid string +// if the last character is potentially cut in half, return the index before the cut +// if validate_utf8(text) == text.size(), then the whole text is valid utf8 +static size_t validate_utf8(const std::string& text) { + size_t len = text.size(); + if (len == 0) return 0; + + // Check the last few bytes to see if a multi-byte character is cut off + for (size_t i = 1; i <= 4 && i <= len; ++i) { + unsigned char c = text[len - i]; + // Check for start of a multi-byte sequence from the end + if ((c & 0xE0) == 0xC0) { + // 2-byte character start: 110xxxxx + // Needs at least 2 bytes + if (i < 2) return len - i; + } else if ((c & 0xF0) == 0xE0) { + // 3-byte character start: 1110xxxx + // Needs at least 3 bytes + if (i < 3) return len - i; + } else if ((c & 0xF8) == 0xF0) { + // 4-byte character start: 11110xxx + // Needs at least 4 bytes + if (i < 4) return len - i; + } + } + + // If no cut-off multi-byte character is found, return full length + return len; +} + // // template utils // @@ -618,23 +674,41 @@ static json oaicompat_completion_params_parse( return llama_params; } -static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) { +static json format_embeddings_response_oaicompat(const json & request, const json & embeddings, bool use_base64 = false) { json data = json::array(); + int32_t n_tokens = 0; int i = 0; for (const auto & elem : embeddings) { - data.push_back(json{ - {"embedding", json_value(elem, "embedding", json::array())}, - {"index", i++}, - {"object", "embedding"} - }); + json embedding_obj; + + if (use_base64) { + const auto& vec = json_value(elem, "embedding", json::array()).get>(); + const char* data_ptr = reinterpret_cast(vec.data()); + size_t data_size = vec.size() * sizeof(float); + embedding_obj = { + {"embedding", base64::encode(data_ptr, data_size)}, + {"index", i++}, + {"object", "embedding"}, + {"encoding_format", "base64"} + }; + } else { + embedding_obj = { + {"embedding", json_value(elem, "embedding", json::array())}, + {"index", i++}, + {"object", "embedding"} + }; + } + data.push_back(embedding_obj); + + n_tokens += json_value(elem, "tokens_evaluated", 0); } json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, - {"usage", json { // TODO: fill - {"prompt_tokens", 0}, - {"total_tokens", 0} + {"usage", json { + {"prompt_tokens", n_tokens}, + {"total_tokens", n_tokens} }}, {"data", data} }; @@ -644,20 +718,23 @@ static json format_embeddings_response_oaicompat(const json & request, const jso static json format_response_rerank(const json & request, const json & ranks) { json data = json::array(); + int32_t n_tokens = 0; int i = 0; for (const auto & rank : ranks) { data.push_back(json{ {"index", i++}, {"relevance_score", json_value(rank, "score", 0.0)}, }); + + n_tokens += json_value(rank, "tokens_evaluated", 0); } json res = json { {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, {"object", "list"}, - {"usage", json { // TODO: fill - {"prompt_tokens", 0}, - {"total_tokens", 0} + {"usage", json { + {"prompt_tokens", n_tokens}, + {"total_tokens", n_tokens} }}, {"results", data} }; @@ -724,3 +801,33 @@ static json format_logit_bias(const std::vector & logit_bias) static std::string safe_json_to_str(json data) { return data.dump(-1, ' ', false, json::error_handler_t::replace); } + +static std::vector get_token_probabilities(llama_context * ctx, int idx) { + std::vector cur; + const auto * logits = llama_get_logits_ith(ctx, idx); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + + cur.resize(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f}; + } + + // sort tokens by logits + std::sort(cur.begin(), cur.end(), [](const llama_token_data & a, const llama_token_data & b) { + return a.logit > b.logit; + }); + + // apply softmax + float max_l = cur[0].logit; + float cum_sum = 0.0f; + for (size_t i = 0; i < cur.size(); ++i) { + float p = expf(cur[i].logit - max_l); + cur[i].p = p; + cum_sum += p; + } + for (size_t i = 0; i < cur.size(); ++i) { + cur[i].p /= cum_sum; + } + + return cur; +} diff --git a/examples/server/webui/index.html b/examples/server/webui/index.html index 9d29fe005..dcdd41079 100644 --- a/examples/server/webui/index.html +++ b/examples/server/webui/index.html @@ -201,6 +201,10 @@
Advanced config
+
+ + +
Show tokens per second diff --git a/examples/server/webui/package-lock.json b/examples/server/webui/package-lock.json index f9104f65f..bbebccbf2 100644 --- a/examples/server/webui/package-lock.json +++ b/examples/server/webui/package-lock.json @@ -8,8 +8,12 @@ "name": "webui", "version": "0.0.0", "dependencies": { + "@sec-ant/readable-stream": "^0.6.0", + "@vscode/markdown-it-katex": "^1.1.1", "autoprefixer": "^10.4.20", "daisyui": "^4.12.14", + "highlight.js": "^11.10.0", + "katex": "^0.16.15", "markdown-it": "^14.1.0", "postcss": "^8.4.49", "tailwindcss": "^3.4.15", @@ -18,6 +22,7 @@ "vue": "^3.5.13" }, "devDependencies": { + "sass-embedded": "^1.83.0", "vite": "^5.4.10" } }, @@ -33,6 +38,13 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/@bufbuild/protobuf": { + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/@bufbuild/protobuf/-/protobuf-2.2.3.tgz", + "integrity": "sha512-tFQoXHJdkEOSwj5tRIZSPNUuXK3RaR7T1nUrPgbYX1pUbvqqaaZAsfo+NXBPsz5rZMSKVFrgK1WL8Q/MSLvprg==", + "devOptional": true, + "license": "(Apache-2.0 AND BSD-3-Clause)" + }, "node_modules/@esbuild/aix-ppc64": { "version": "0.21.5", "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.21.5.tgz", @@ -606,6 +618,21 @@ "win32" ] }, + "node_modules/@sec-ant/readable-stream": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@sec-ant/readable-stream/-/readable-stream-0.6.0.tgz", + "integrity": "sha512-uiBh8DrB5FN35gP6/o8JEhEQ7/ci1jUsOZO/VMUjyvTpjtV54VstOXVj1TvTj/wsT23pfX6butxxh3qufsW3+g==", + "license": "MIT" + }, + "node_modules/@vscode/markdown-it-katex": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@vscode/markdown-it-katex/-/markdown-it-katex-1.1.1.tgz", + "integrity": "sha512-3KTlbsRBPJQLE2YmLL7K6nunTlU+W9T5+FjfNdWuIUKgxSS6HWLQHaO3L4MkJi7z7MpIPpY+g4N+cWNBPE/MSA==", + "license": "MIT", + "dependencies": { + "katex": "^0.16.4" + } + }, "node_modules/@vue/compiler-dom": { "version": "3.5.13", "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.5.13.tgz", @@ -1004,6 +1031,13 @@ "browserslist": ">= 4.21.0" } }, + "node_modules/buffer-builder": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/buffer-builder/-/buffer-builder-0.2.0.tgz", + "integrity": "sha512-7VPMEPuYznPSoR21NE1zvd2Xna6c/CloiZCfcMXR1Jny6PjX0N4Nsa38zcBFo/FMK+BlA+FLKbJCQ0i2yxp+Xg==", + "devOptional": true, + "license": "MIT/X11" + }, "node_modules/caniuse-lite": { "version": "1.0.30001684", "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001684.tgz", @@ -1166,6 +1200,22 @@ "node": ">=8.0" } }, + "node_modules/colorjs.io": { + "version": "0.5.2", + "resolved": "https://registry.npmjs.org/colorjs.io/-/colorjs.io-0.5.2.tgz", + "integrity": "sha512-twmVoizEW7ylZSN32OgKdXRmo1qg+wT5/6C3xu5b9QsWzSFAhHLn2xd8ro0diCsKfCj1RdaTP/nrcW+vAoQPIw==", + "devOptional": true, + "license": "MIT" + }, + "node_modules/commander": { + "version": "8.3.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-8.3.0.tgz", + "integrity": "sha512-OkTL9umf+He2DZkUq8f8J9of7yL6RJKI24dVITBmNfZBmri9zYZQrKkuXiKhyfPSu8tUhnVBB1iKXevvnlR4Ww==", + "license": "MIT", + "engines": { + "node": ">= 12" + } + }, "node_modules/css-selector-tokenizer": { "version": "0.8.0", "resolved": "https://registry.npmjs.org/css-selector-tokenizer/-/css-selector-tokenizer-0.8.0.tgz", @@ -1473,6 +1523,31 @@ "node": ">=10.13.0" } }, + "node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "devOptional": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/highlight.js": { + "version": "11.10.0", + "resolved": "https://registry.npmjs.org/highlight.js/-/highlight.js-11.10.0.tgz", + "integrity": "sha512-SYVnVFswQER+zu1laSya563s+F8VDGt7o35d4utbamowvUNLLMovFqwCLSocpZTz3MgaSRA1IbqRWZv97dtErQ==", + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/immutable": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-5.0.3.tgz", + "integrity": "sha512-P8IdPQHq3lA1xVeBRi5VPqUm5HDgKnx0Ru51wZz5mjxHr5n3RWhjIpOFU7ybkUxfB+5IToy+OLaHYDBIWsv+uw==", + "devOptional": true, + "license": "MIT" + }, "node_modules/is-glob": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", @@ -1503,6 +1578,22 @@ "jiti": "bin/jiti.js" } }, + "node_modules/katex": { + "version": "0.16.15", + "resolved": "https://registry.npmjs.org/katex/-/katex-0.16.15.tgz", + "integrity": "sha512-yE9YJIEAk2aZ+FL/G8r+UGw0CTUzEA8ZFy6E+8tc3spHUKq3qBnzCkI1CQwGoI9atJhVyFPEypQsTY7mJ1Pi9w==", + "funding": [ + "https://opencollective.com/katex", + "https://github.com/sponsors/katex" + ], + "license": "MIT", + "dependencies": { + "commander": "^8.3.0" + }, + "bin": { + "katex": "cli.js" + } + }, "node_modules/lilconfig": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", @@ -2022,6 +2113,381 @@ "integrity": "sha512-AYnb1nQyY49te+VRAVgmzfcgjYS91mY5P0TKUDCLEM+gNnA+3T6rWITXRLYCpahpqSQbN5cE+gHpnPyXjHWxcw==", "license": "MIT" }, + "node_modules/rxjs": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", + "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", + "devOptional": true, + "license": "Apache-2.0", + "dependencies": { + "tslib": "^2.1.0" + } + }, + "node_modules/sass-embedded": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded/-/sass-embedded-1.83.0.tgz", + "integrity": "sha512-/8cYZeL39evUqe0o//193na51Q1VWZ61qhxioQvLJwOtWIrX+PgNhCyD8RSuTtmzc4+6+waFZf899bfp/MCUwA==", + "devOptional": true, + "license": "MIT", + "dependencies": { + "@bufbuild/protobuf": "^2.0.0", + "buffer-builder": "^0.2.0", + "colorjs.io": "^0.5.0", + "immutable": "^5.0.2", + "rxjs": "^7.4.0", + "supports-color": "^8.1.1", + "sync-child-process": "^1.0.2", + "varint": "^6.0.0" + }, + "bin": { + "sass": "dist/bin/sass.js" + }, + "engines": { + "node": ">=16.0.0" + }, + "optionalDependencies": { + "sass-embedded-android-arm": "1.83.0", + "sass-embedded-android-arm64": "1.83.0", + "sass-embedded-android-ia32": "1.83.0", + "sass-embedded-android-riscv64": "1.83.0", + "sass-embedded-android-x64": "1.83.0", + "sass-embedded-darwin-arm64": "1.83.0", + "sass-embedded-darwin-x64": "1.83.0", + "sass-embedded-linux-arm": "1.83.0", + "sass-embedded-linux-arm64": "1.83.0", + "sass-embedded-linux-ia32": "1.83.0", + "sass-embedded-linux-musl-arm": "1.83.0", + "sass-embedded-linux-musl-arm64": "1.83.0", + "sass-embedded-linux-musl-ia32": "1.83.0", + "sass-embedded-linux-musl-riscv64": "1.83.0", + "sass-embedded-linux-musl-x64": "1.83.0", + "sass-embedded-linux-riscv64": "1.83.0", + "sass-embedded-linux-x64": "1.83.0", + "sass-embedded-win32-arm64": "1.83.0", + "sass-embedded-win32-ia32": "1.83.0", + "sass-embedded-win32-x64": "1.83.0" + } + }, + "node_modules/sass-embedded-android-arm": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-arm/-/sass-embedded-android-arm-1.83.0.tgz", + "integrity": "sha512-uwFSXzJlfbd4Px189xE5l+cxN8+TQpXdQgJec7TIrb4HEY7imabtpYufpVdqUVwT1/uiis5V4+qIEC4Vl5XObQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-arm64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-arm64/-/sass-embedded-android-arm64-1.83.0.tgz", + "integrity": "sha512-GBiCvM4a2rkWBLdYDxI6XYnprfk5U5c81g69RC2X6kqPuzxzx8qTArQ9M6keFK4+iDQ5N9QTwFCr0KbZTn+ZNQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-ia32": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-ia32/-/sass-embedded-android-ia32-1.83.0.tgz", + "integrity": "sha512-5ATPdGo2SICqAhiJl/Z8KQ23zH4sGgobGgux0TnrNtt83uHZ+r+To/ubVJ7xTkZxed+KJZnIpolGD8dQyQqoTg==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-riscv64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-riscv64/-/sass-embedded-android-riscv64-1.83.0.tgz", + "integrity": "sha512-aveknUOB8GZewOzVn2Uwk+DKcncTR50Q6vtzslNMGbYnxtgQNHzy8A1qVEviNUruex+pHofppeMK4iMPFAbiEQ==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-android-x64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-android-x64/-/sass-embedded-android-x64-1.83.0.tgz", + "integrity": "sha512-WqIay/72ncyf9Ph4vS742J3a73wZihWmzFUwpn1OD6lme1Aj4eWzWIve5IVnlTEJgcZcDHu6ECID9IZgehJKoA==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-darwin-arm64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-darwin-arm64/-/sass-embedded-darwin-arm64-1.83.0.tgz", + "integrity": "sha512-XQl9QqgxFFIPm/CzHhmppse5o9ocxrbaAdC2/DAnlAqvYWBBtgFqPjGoYlej13h9SzfvNoogx+y9r+Ap+e+hYg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-darwin-x64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-darwin-x64/-/sass-embedded-darwin-x64-1.83.0.tgz", + "integrity": "sha512-ERQ7Tvp1kFOW3ux4VDFIxb7tkYXHYc+zJpcrbs0hzcIO5ilIRU2tIOK1OrNwrFO6Qxyf7AUuBwYKLAtIU/Nz7g==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-arm": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm/-/sass-embedded-linux-arm-1.83.0.tgz", + "integrity": "sha512-baG9RYBJxUFmqwDNC9h9ZFElgJoyO3jgHGjzEZ1wHhIS9anpG+zZQvO8bHx3dBpKEImX+DBeLX+CxsFR9n81gQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-arm64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-arm64/-/sass-embedded-linux-arm64-1.83.0.tgz", + "integrity": "sha512-syEAVTJt4qhaMLxrSwOWa46zdqHJdnqJkLUK+t9aCr8xqBZLPxSUeIGji76uOehQZ1C+KGFj6n9xstHN6wzOJw==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-ia32": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-ia32/-/sass-embedded-linux-ia32-1.83.0.tgz", + "integrity": "sha512-RRBxQxMpoxu5+XcSSc6QR/o9asEwUzR8AbCS83RaXcdTIHTa/CccQsiAoDDoPlRsMTLqnzs0LKL4CfOsf7zBbA==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-arm": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm/-/sass-embedded-linux-musl-arm-1.83.0.tgz", + "integrity": "sha512-Yc7u2TelCfBab+PRob9/MNJFh3EooMiz4urvhejXkihTiKSHGCv5YqDdtWzvyb9tY2Jb7YtYREVuHwfdVn3dTQ==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-arm64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-arm64/-/sass-embedded-linux-musl-arm64-1.83.0.tgz", + "integrity": "sha512-Y7juhPHClUO2H5O+u+StRy6SEAcwZ+hTEk5WJdEmo1Bb1gDtfHvJaWB/iFZJ2tW0W1e865AZeUrC4OcOFjyAQA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-ia32": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-ia32/-/sass-embedded-linux-musl-ia32-1.83.0.tgz", + "integrity": "sha512-arQeYwGmwXV8byx5G1PtSzZWW1jbkfR5qrIHMEbTFSAvAxpqjgSvCvrHMOFd73FcMxVaYh4BX9LQNbKinkbEdg==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-riscv64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-riscv64/-/sass-embedded-linux-musl-riscv64-1.83.0.tgz", + "integrity": "sha512-E6uzlIWz59rut+Z3XR6mLG915zNzv07ISvj3GUNZENdHM7dF8GQ//ANoIpl5PljMQKp89GnYdvo6kj2gnaBf/g==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-musl-x64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-musl-x64/-/sass-embedded-linux-musl-x64-1.83.0.tgz", + "integrity": "sha512-eAMK6tyGqvqr21r9g8BnR3fQc1rYFj85RGduSQ3xkITZ6jOAnOhuU94N5fwRS852Hpws0lXhET+7JHXgg3U18w==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-riscv64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-riscv64/-/sass-embedded-linux-riscv64-1.83.0.tgz", + "integrity": "sha512-Ojpi78pTv02sy2fUYirRGXHLY3fPnV/bvwuC2i5LwPQw2LpCcFyFTtN0c5h4LJDk9P6wr+/ZB/JXU8tHIOlK+Q==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-linux-x64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-linux-x64/-/sass-embedded-linux-x64-1.83.0.tgz", + "integrity": "sha512-3iLjlXdoPfgZRtX4odhRvka1BQs5mAXqfCtDIQBgh/o0JnGPzJIWWl9bYLpHxK8qb+uyVBxXYgXpI0sCzArBOw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-win32-arm64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-win32-arm64/-/sass-embedded-win32-arm64-1.83.0.tgz", + "integrity": "sha512-iOHw/8/t2dlTW3lOFwG5eUbiwhEyGWawivlKWJ8lkXH7fjMpVx2VO9zCFAm8RvY9xOHJ9sf1L7g5bx3EnNP9BQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-win32-ia32": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-win32-ia32/-/sass-embedded-win32-ia32-1.83.0.tgz", + "integrity": "sha512-2PxNXJ8Pad4geVcTXY4rkyTr5AwbF8nfrCTDv0ulbTvPhzX2mMKEGcBZUXWn5BeHZTBc6whNMfS7d5fQXR9dDQ==", + "cpu": [ + "ia32" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/sass-embedded-win32-x64": { + "version": "1.83.0", + "resolved": "https://registry.npmjs.org/sass-embedded-win32-x64/-/sass-embedded-win32-x64-1.83.0.tgz", + "integrity": "sha512-muBXkFngM6eLTNqOV0FQi7Dv9s+YRQ42Yem26mosdan/GmJQc81deto6uDTgrYn+bzFNmiXcOdfm+0MkTWK3OQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/sucrase": { "version": "3.35.0", "resolved": "https://registry.npmjs.org/sucrase/-/sucrase-3.35.0.tgz", @@ -2641,6 +3107,45 @@ "node": ">=8" } }, + "node_modules/supports-color": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", + "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "devOptional": true, + "license": "MIT", + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, + "node_modules/sync-child-process": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/sync-child-process/-/sync-child-process-1.0.2.tgz", + "integrity": "sha512-8lD+t2KrrScJ/7KXCSyfhT3/hRq78rC0wBFqNJXv3mZyn6hW2ypM05JmlSvtqRbeq6jqA94oHbxAr2vYsJ8vDA==", + "devOptional": true, + "license": "MIT", + "dependencies": { + "sync-message-port": "^1.0.0" + }, + "engines": { + "node": ">=16.0.0" + } + }, + "node_modules/sync-message-port": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/sync-message-port/-/sync-message-port-1.1.3.tgz", + "integrity": "sha512-GTt8rSKje5FilG+wEdfCkOcLL7LWqpMlr2c3LRuKt/YXxcJ52aGSbGBAdI4L3aaqfrBt6y711El53ItyH1NWzg==", + "devOptional": true, + "license": "MIT", + "engines": { + "node": ">=16.0.0" + } + }, "node_modules/tailwindcss": { "version": "3.4.15", "resolved": "https://registry.npmjs.org/tailwindcss/-/tailwindcss-3.4.15.tgz", @@ -2684,12 +3189,26 @@ "integrity": "sha512-iBHbi7BQxrFmwZUQJsT0SjNzlLLsXhvW/kg7EyOMVMBIrlnj/qYofwo1LVLZi+3GbUEo96Iu2eqToI2+lZoAEQ==", "license": "MIT" }, + "node_modules/tslib": { + "version": "2.8.1", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.8.1.tgz", + "integrity": "sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==", + "devOptional": true, + "license": "0BSD" + }, "node_modules/uc.micro": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/uc.micro/-/uc.micro-2.1.0.tgz", "integrity": "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A==", "license": "MIT" }, + "node_modules/varint": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/varint/-/varint-6.0.0.tgz", + "integrity": "sha512-cXEIW6cfr15lFv563k4GuVuW/fiwjknytD37jIOLSdSWuOI6WnO/oKwmP2FQTU2l01LP8/M5TSAJpzUaGe3uWg==", + "devOptional": true, + "license": "MIT" + }, "node_modules/vite": { "version": "5.4.11", "resolved": "https://registry.npmjs.org/vite/-/vite-5.4.11.tgz", diff --git a/examples/server/webui/package.json b/examples/server/webui/package.json index d656a841d..2836cce00 100644 --- a/examples/server/webui/package.json +++ b/examples/server/webui/package.json @@ -6,14 +6,20 @@ "scripts": { "dev": "vite", "build": "vite build", - "preview": "vite preview" + "preview": "vite preview", + "analyze": "ANALYZE=1 npx vite-bundle-visualizer" }, "devDependencies": { + "sass-embedded": "^1.83.0", "vite": "^5.4.10" }, "dependencies": { + "@sec-ant/readable-stream": "^0.6.0", + "@vscode/markdown-it-katex": "^1.1.1", "autoprefixer": "^10.4.20", "daisyui": "^4.12.14", + "highlight.js": "^11.10.0", + "katex": "^0.16.15", "markdown-it": "^14.1.0", "postcss": "^8.4.49", "tailwindcss": "^3.4.15", diff --git a/examples/server/webui/public/demo-conversation.json b/examples/server/webui/public/demo-conversation.json new file mode 100644 index 000000000..75ab599dd --- /dev/null +++ b/examples/server/webui/public/demo-conversation.json @@ -0,0 +1,33 @@ +{ + "demo": true, + "id": "conv-1734086746930", + "lastModified": 1734087548943, + "messages": [ + { + "id": 1734086764521, + "role": "user", + "content": "this is a demo conversation, used in dev mode" + }, + { + "id": 1734087548327, + "role": "assistant", + "content": "This is the formula:\n\n$\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}$\n\nGiven an input vector \\(\\mathbf{x} = [x_1, x_2, \\ldots, x_n]\\)\n\n\\[\ny_i = \\frac{e^{x_i}}{\\sum_{j=1}^n e^{x_j}}\n\\]\n\nCode block latex:\n```latex\n\\frac{e^{x_i}}{\\sum_{j=1}^{n}e^{x_j}}\n```\n\nTest dollar sign: $1234 $4567\n\nInvalid latex syntax: $E = mc^$ and $$E = mc^$$", + "timings": { + "prompt_n": 1, + "prompt_ms": 28.923, + "predicted_n": 25, + "predicted_ms": 573.016 + } + }, + { + "id": 1734087548328, + "role": "user", + "content": "this is a demo conversation, used in dev mode" + }, + { + "id": 1734087548329, + "role": "assistant", + "content": "Code block:\n```js\nconsole.log('hello world')\n```\n```sh\nls -la /dev\n```" + } + ] +} diff --git a/examples/server/webui/src/highlight-config.js b/examples/server/webui/src/highlight-config.js new file mode 100644 index 000000000..96c7028f9 --- /dev/null +++ b/examples/server/webui/src/highlight-config.js @@ -0,0 +1,60 @@ +import hljs from 'highlight.js/lib/core'; + +// only import commonly used languages to reduce bundle size + +import python from 'highlight.js/lib/languages/python'; +import javascript from 'highlight.js/lib/languages/javascript'; +import json from 'highlight.js/lib/languages/json'; +import bash from 'highlight.js/lib/languages/bash'; +import yaml from 'highlight.js/lib/languages/yaml'; +import markdown from 'highlight.js/lib/languages/markdown'; +import scss from 'highlight.js/lib/languages/scss'; +import xml from 'highlight.js/lib/languages/xml'; +import ruby from 'highlight.js/lib/languages/ruby'; +import go from 'highlight.js/lib/languages/go'; +import java from 'highlight.js/lib/languages/java'; +import rust from 'highlight.js/lib/languages/rust'; +import scala from 'highlight.js/lib/languages/scala'; +import cpp from 'highlight.js/lib/languages/cpp'; +import csharp from 'highlight.js/lib/languages/csharp'; +import swift from 'highlight.js/lib/languages/swift'; +import dart from 'highlight.js/lib/languages/dart'; +import elixir from 'highlight.js/lib/languages/elixir'; +import kotlin from 'highlight.js/lib/languages/kotlin'; +import lua from 'highlight.js/lib/languages/lua'; +import php from 'highlight.js/lib/languages/php'; +import latex from 'highlight.js/lib/languages/latex'; + +hljs.registerLanguage('python', python); +hljs.registerLanguage('javascript', javascript); +hljs.registerLanguage('json', json); +hljs.registerLanguage('yaml', yaml); +hljs.registerLanguage('markdown', markdown); +hljs.registerLanguage('xml', xml); +hljs.registerLanguage('ruby', ruby); +hljs.registerLanguage('go', go); +hljs.registerLanguage('java', java); +hljs.registerLanguage('rust', rust); +hljs.registerLanguage('scala', scala); +hljs.registerLanguage('csharp', csharp); +hljs.registerLanguage('swift', swift); +hljs.registerLanguage('dart', dart); +hljs.registerLanguage('elixir', elixir); +hljs.registerLanguage('kotlin', kotlin); +hljs.registerLanguage('lua', lua); +hljs.registerLanguage('php', php); +hljs.registerLanguage('latex', latex); + +// reuse some languages to further reduce bundle size + +hljs.registerLanguage('shell', bash); +hljs.registerLanguage('bash', bash); +hljs.registerLanguage('sh', bash); + +hljs.registerLanguage('css', scss); +hljs.registerLanguage('scss', scss); + +hljs.registerLanguage('c', cpp); +hljs.registerLanguage('cpp', cpp); + +export default hljs; diff --git a/examples/server/webui/src/katex-gpt.js b/examples/server/webui/src/katex-gpt.js new file mode 100644 index 000000000..7c7c5e22c --- /dev/null +++ b/examples/server/webui/src/katex-gpt.js @@ -0,0 +1,66 @@ +import katex from 'katex'; + +// Adapted from https://github.com/SchneeHertz/markdown-it-katex-gpt +// MIT license + +const defaultOptions = { + delimiters: [ + { left: '\\[', right: '\\]', display: true }, + { left: '\\(', right: '\\)', display: false }, + ], +}; + +export function renderLatexHTML(content, display = false) { + return katex.renderToString(content, { + throwOnError: false, + output: 'mathml', + displayMode: display, + }); +} + +function escapedBracketRule(options) { + return (state, silent) => { + const max = state.posMax; + const start = state.pos; + + for (const { left, right, display } of options.delimiters) { + + // Check if it starts with the left delimiter + if (!state.src.slice(start).startsWith(left)) continue; + + // Skip the length of the left delimiter + let pos = start + left.length; + + // Find the matching right delimiter + while (pos < max) { + if (state.src.slice(pos).startsWith(right)) { + break; + } + pos++; + } + + // No matching right delimiter found, skip to the next match + if (pos >= max) continue; + + // If not in silent mode, convert LaTeX formula to MathML + if (!silent) { + const content = state.src.slice(start + left.length, pos); + try { + const renderedContent = renderLatexHTML(content, display); + const token = state.push('html_inline', '', 0); + token.content = renderedContent; + } catch (e) { + console.error(e); + } + } + + // Update position, skip the length of the right delimiter + state.pos = pos + right.length; + return true; + } + } +} + +export default function (md, options = defaultOptions) { + md.inline.ruler.after('text', 'escaped_bracket', escapedBracketRule(options)); +} diff --git a/examples/server/webui/src/main.js b/examples/server/webui/src/main.js index f1f35481d..358a40628 100644 --- a/examples/server/webui/src/main.js +++ b/examples/server/webui/src/main.js @@ -1,8 +1,20 @@ -import './styles.css'; +import './styles.scss'; import { createApp, defineComponent, shallowRef, computed, h } from 'vue/dist/vue.esm-bundler.js'; import MarkdownIt from 'markdown-it'; import TextLineStream from 'textlinestream'; +// math formula rendering +import 'katex/dist/katex.min.css'; +import markdownItKatexGpt from './katex-gpt'; +import markdownItKatexNormal from '@vscode/markdown-it-katex'; + +// code highlighting +import hljs from './highlight-config'; +import daisyuiThemes from 'daisyui/src/theming/themes'; + +// ponyfill for missing ReadableStream asyncIterator on Safari +import { asyncIterator } from '@sec-ant/readable-stream/ponyfill/asyncIterator'; + const isDev = import.meta.env.MODE === 'development'; // utility functions @@ -10,18 +22,36 @@ const isString = (x) => !!x.toLowerCase; const isBoolean = (x) => x === true || x === false; const isNumeric = (n) => !isString(n) && !isNaN(n) && !isBoolean(n); const escapeAttr = (str) => str.replace(/>/g, '>').replace(/"/g, '"'); -const copyStr = (str) => navigator.clipboard.writeText(str); +const copyStr = (textToCopy) => { + // Navigator clipboard api needs a secure context (https) + if (navigator.clipboard && window.isSecureContext) { + navigator.clipboard.writeText(textToCopy); + } else { + // Use the 'out of viewport hidden text area' trick + const textArea = document.createElement('textarea'); + textArea.value = textToCopy; + // Move textarea out of the viewport so it's not visible + textArea.style.position = 'absolute'; + textArea.style.left = '-999999px'; + document.body.prepend(textArea); + textArea.select(); + document.execCommand('copy'); + } +}; // constants -const BASE_URL = localStorage.getItem('base') // for debugging - || (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production +const BASE_URL = isDev + ? (localStorage.getItem('base') || 'https://localhost:8080') // for debugging + : (new URL('.', document.baseURI).href).toString().replace(/\/$/, ''); // for production +console.log({ BASE_URL }); + const CONFIG_DEFAULT = { // Note: in order not to introduce breaking changes, please keep the same data type (number, string, etc) if you want to change the default value. Do not use null or undefined for default value. apiKey: '', systemMessage: 'You are a helpful assistant.', showTokensPerSecond: false, // make sure these default values are in sync with `common.h` - samplers: 'dkypmxt', + samplers: 'edkypmxt', temperature: 0.8, dynatemp_range: 0.0, dynatemp_exponent: 1.0, @@ -69,12 +99,39 @@ const CONFIG_INFO = { // config keys having numeric value (i.e. temperature, top_k, top_p, etc) const CONFIG_NUMERIC_KEYS = Object.entries(CONFIG_DEFAULT).filter(e => isNumeric(e[1])).map(e => e[0]); // list of themes supported by daisyui -const THEMES = ['light', 'dark', 'cupcake', 'bumblebee', 'emerald', 'corporate', 'synthwave', 'retro', 'cyberpunk', 'valentine', 'halloween', 'garden', 'forest', 'aqua', 'lofi', 'pastel', 'fantasy', 'wireframe', 'black', 'luxury', 'dracula', 'cmyk', 'autumn', 'business', 'acid', 'lemonade', 'night', 'coffee', 'winter', 'dim', 'nord', 'sunset']; +const THEMES = ['light', 'dark'] + // make sure light & dark are always at the beginning + .concat(Object.keys(daisyuiThemes).filter(t => t !== 'light' && t !== 'dark')); // markdown support const VueMarkdown = defineComponent( (props) => { - const md = shallowRef(new MarkdownIt({ breaks: true })); + const md = shallowRef(new MarkdownIt({ + breaks: true, + highlight: function (str, lang) { // Add highlight.js + if (lang && hljs.getLanguage(lang)) { + try { + return '
' +
+                   hljs.highlight(str, { language: lang, ignoreIllegals: true }).value +
+                   '
'; + } catch (__) {} + } + return '
' + md.value.utils.escapeHtml(str) + '
'; + } + })); + // support latex with double dollar sign and square brackets + md.value.use(markdownItKatexGpt, { + delimiters: [ + { left: '\\[', right: '\\]', display: true }, + { left: '\\(', right: '\\)', display: false }, + { left: '$$', right: '$$', display: false }, + // do not add single dollar sign here, other wise it will confused with dollar used for money symbol + ], + throwOnError: false, + }); + // support latex with single dollar sign + md.value.use(markdownItKatexNormal, { throwOnError: false }); + // add copy button to code blocks const origFenchRenderer = md.value.renderer.rules.fence; md.value.renderer.rules.fence = (tokens, idx, ...args) => { const content = tokens[idx].content; @@ -88,9 +145,9 @@ const VueMarkdown = defineComponent( }; window.copyStr = copyStr; const content = computed(() => md.value.render(props.source)); - return () => h("div", { innerHTML: content.value }); + return () => h('div', { innerHTML: content.value }); }, - { props: ["source"] } + { props: ['source'] } ); // input field to be used by settings modal @@ -244,7 +301,7 @@ async function* sendSSEPostRequest(url, fetchOptions) { const lines = res.body .pipeThrough(new TextDecoderStream()) .pipeThrough(new TextLineStream()); - for await (const line of lines) { + for await (const line of asyncIterator(lines)) { if (isDev) console.log({line}); if (line.startsWith('data:') && !line.endsWith('[DONE]')) { const data = JSON.parse(line.slice(5)); @@ -278,6 +335,7 @@ const mainApp = createApp({ themes: THEMES, configDefault: {...CONFIG_DEFAULT}, configInfo: {...CONFIG_INFO}, + isDev, } }, computed: {}, @@ -289,6 +347,7 @@ const mainApp = createApp({ if (this.isGenerating) chatScrollToBottom(true); }); resizeObserver.observe(pendingMsgElem); + this.setSelectedTheme(this.selectedTheme); }, watch: { viewingConvId: function(val, oldVal) { @@ -305,6 +364,8 @@ const mainApp = createApp({ }, setSelectedTheme(theme) { this.selectedTheme = theme; + document.body.setAttribute('data-theme', theme); + document.body.setAttribute('data-color-scheme', daisyuiThemes[theme]?.['color-scheme'] ?? 'auto'); StorageUtils.setTheme(theme); }, newConversation() { @@ -399,7 +460,7 @@ const mainApp = createApp({ method: 'POST', headers: { 'Content-Type': 'application/json', - 'Authorization': this.config.apiKey ? `Bearer ${this.config.apiKey}` : undefined, + ...(this.config.apiKey ? {'Authorization': `Bearer ${this.config.apiKey}`} : {}) }, body: JSON.stringify(params), signal: abortController.signal, @@ -513,6 +574,17 @@ const mainApp = createApp({ fetchMessages() { this.messages = StorageUtils.getOneConversation(this.viewingConvId)?.messages ?? []; }, + + // debug functions + async debugImportDemoConv() { + const res = await fetch('/demo-conversation.json'); + const demoConv = await res.json(); + StorageUtils.remove(demoConv.id); + for (const msg of demoConv.messages) { + StorageUtils.appendMsg(demoConv.id, msg); + } + this.fetchConversation(); + } }, }); mainApp.config.errorHandler = alert; diff --git a/examples/server/webui/src/styles.css b/examples/server/webui/src/styles.css deleted file mode 100644 index 67d35b99e..000000000 --- a/examples/server/webui/src/styles.css +++ /dev/null @@ -1,26 +0,0 @@ -@tailwind base; -@tailwind components; -@tailwind utilities; - -.markdown { - h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } - pre { - @apply whitespace-pre-wrap rounded-lg p-2; - border: 1px solid currentColor; - } - /* TODO: fix markdown table */ -} - -.show-on-hover { - @apply md:opacity-0 md:group-hover:opacity-100; -} -.btn-mini { - @apply cursor-pointer hover:shadow-md; -} -.chat-screen { max-width: 900px; } - -.chat-bubble-base-300 { - --tw-bg-opacity: 1; - --tw-text-opacity: 1; - @apply bg-base-300 text-base-content; -} diff --git a/examples/server/webui/src/styles.scss b/examples/server/webui/src/styles.scss new file mode 100644 index 000000000..34fe2aaf0 --- /dev/null +++ b/examples/server/webui/src/styles.scss @@ -0,0 +1,48 @@ +@use "sass:meta"; + +@tailwind base; +@tailwind components; +@tailwind utilities; + +.markdown { + h1, h2, h3, h4, h5, h6, ul, ol, li { all: revert; } + pre { + @apply whitespace-pre-wrap rounded-lg p-2; + border: 1px solid currentColor; + } + /* TODO: fix markdown table */ +} + +.show-on-hover { + @apply md:opacity-0 md:group-hover:opacity-100; +} +.btn-mini { + @apply cursor-pointer hover:shadow-md; +} +.chat-screen { max-width: 900px; } + +.chat-bubble-base-300 { + --tw-bg-opacity: 1; + --tw-text-opacity: 1; + @apply bg-base-300 text-base-content; +} + +/* Highlight.js */ +[data-color-scheme='light'] { + @include meta.load-css('highlight.js/styles/stackoverflow-light'); +} +[data-color-scheme='dark'] { + @include meta.load-css('highlight.js/styles/stackoverflow-dark'); +} +[data-color-scheme='auto'] { + @media (prefers-color-scheme: light) { + @include meta.load-css('highlight.js/styles/stackoverflow-light'); + } + @media (prefers-color-scheme: dark) { + @include meta.load-css('highlight.js/styles/stackoverflow-dark'); + } +} +.hljs { + background: transparent !important; + padding: 0.5em !important; +} diff --git a/examples/server/webui/vite.config.js b/examples/server/webui/vite.config.js index 789bf9cbb..6619a630d 100644 --- a/examples/server/webui/vite.config.js +++ b/examples/server/webui/vite.config.js @@ -2,6 +2,9 @@ import { viteSingleFile } from 'vite-plugin-singlefile'; import path from 'path'; import fs from 'fs'; +import zlib from 'zlib'; + +const MAX_BUNDLE_SIZE = 1.5 * 1024 * 1024; // only increase when absolutely necessary const GUIDE_FOR_FRONTEND = ` `.trim(); -export default { - plugins: [ - viteSingleFile(), - (function llamaCppPlugin() { - let config; - return { - name: 'llamacpp:build', - apply: 'build', - async configResolved(_config) { - config = _config; - }, - writeBundle() { - const outputIndexHtml = path.join(config.build.outDir, 'index.html'); - const content = fs.readFileSync(outputIndexHtml, 'utf-8'); +const BUILD_PLUGINS = [ + viteSingleFile(), + (function llamaCppPlugin() { + let config; + return { + name: 'llamacpp:build', + apply: 'build', + async configResolved(_config) { + config = _config; + }, + writeBundle() { + const outputIndexHtml = path.join(config.build.outDir, 'index.html'); + const content = GUIDE_FOR_FRONTEND + '\n' + fs.readFileSync(outputIndexHtml, 'utf-8'); + const compressed = zlib.gzipSync(Buffer.from(content, 'utf-8'), { level: 9 }); - const targetOutputFile = path.join(config.build.outDir, '../../public/index.html'); - fs.writeFileSync(targetOutputFile, GUIDE_FOR_FRONTEND + '\n' + content); + // because gzip header contains machine-specific info, we must remove these data from the header + // timestamp + compressed[0x4] = 0; + compressed[0x5] = 0; + compressed[0x6] = 0; + compressed[0x7] = 0; + // OS + compressed[0x9] = 0; + + if (compressed.byteLength > MAX_BUNDLE_SIZE) { + throw new Error( + `Bundle size is too large (${Math.ceil(compressed.byteLength / 1024)} KB).\n` + + `Please reduce the size of the frontend or increase MAX_BUNDLE_SIZE in vite.config.js.\n`, + ); } + + const targetOutputFile = path.join(config.build.outDir, '../../public/index.html.gz'); + fs.writeFileSync(targetOutputFile, compressed); } - })(), - ], + } + })(), +]; + +/** @type {import('vite').UserConfig} */ +export default { + plugins: process.env.ANALYZE ? [] : BUILD_PLUGINS, }; diff --git a/examples/tts/CMakeLists.txt b/examples/tts/CMakeLists.txt new file mode 100644 index 000000000..c72bd814c --- /dev/null +++ b/examples/tts/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET llama-tts) +add_executable(${TARGET} tts.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/tts/convert_pt_to_hf.py b/examples/tts/convert_pt_to_hf.py new file mode 100644 index 000000000..8909a65fd --- /dev/null +++ b/examples/tts/convert_pt_to_hf.py @@ -0,0 +1,180 @@ +# convert the https://huggingface.co/novateur/WavTokenizer-large-speech-75token to HF format +# the goal is to be able to reuse the convert_hf_to_gguf.py after that to create a GGUF file with the WavTokenizer decoder +# +# TODO: this script is LLM-generated and probably very inefficient and should be rewritten + +import torch +import json +import os +import sys +import re + +from safetensors.torch import save_file + +# default +model_path = './model.pt'; + +# read from CLI +if len(sys.argv) > 1: + model_path = sys.argv[1] + +# get the directory of the input model +path_dst = os.path.dirname(model_path) + +print(f"Loading model from {model_path}") + +model = torch.load(model_path, map_location='cpu') + +#print(model) + +# print all keys +for key in model.keys(): + print(key) + if key == 'hyper_parameters': + #print(model[key]) + # dump as json pretty + print(json.dumps(model[key], indent=4)) + #if key != 'state_dict' and key != 'optimizer_states': + # print(model[key]) + +# Check if the loaded model is a state_dict or a model instance +if isinstance(model, torch.nn.Module): + state_dict = model.state_dict() +else: + state_dict = model + +# Print the structure of the state_dict to understand its format +print("State dictionary keys:") +for key in state_dict.keys(): + print(key) + +# Ensure the state_dict is flat and contains only torch.Tensor objects +def flatten_state_dict(state_dict, parent_key='', sep='.'): + items = [] + items_new = [] + + for k, v in state_dict.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, torch.Tensor): + items.append((new_key, v)) + elif isinstance(v, dict): + items.extend(flatten_state_dict(v, new_key, sep=sep).items()) + return dict(items) + + size_total_mb = 0 + + for key, value in list(items): + # keep only what we need for inference + if not key.startswith('state_dict.feature_extractor.encodec.quantizer.') and \ + not key.startswith('state_dict.backbone.') and \ + not key.startswith('state_dict.head.out'): + print('Skipping key: ', key) + continue + + new_key = key + + new_key = new_key.replace('state_dict.', '') + new_key = new_key.replace('pos_net', 'posnet') + + # check if matches "backbone.posnet.%d.bias" or "backbone.posnet.%d.weight" + if new_key.startswith("backbone.posnet."): + match = re.match(r"backbone\.posnet\.(\d+)\.(bias|weight)", new_key) + if match: + new_key = f"backbone.posnet.{match.group(1)}.norm.{match.group(2)}" + + # "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed" -> "backbone.embedding.weight" + if new_key == "feature_extractor.encodec.quantizer.vq.layers.0._codebook.embed": + new_key = "backbone.embedding.weight" + + # these are the only rows used + # ref: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/wav_tokenizer/audio_codec.py#L100 + if new_key.endswith("norm.scale.weight"): + new_key = new_key.replace("norm.scale.weight", "norm.weight") + value = value[0] + + if new_key.endswith("norm.shift.weight"): + new_key = new_key.replace("norm.shift.weight", "norm.bias") + value = value[0] + + if new_key.endswith("gamma"): + new_key = new_key.replace("gamma", "gamma.weight") + + # convert from 1D [768] to 2D [768, 1] so that ggml_add can broadcast the bias + if (new_key.endswith("norm.weight") or new_key.endswith("norm1.weight") or new_key.endswith("norm2.weight") or new_key.endswith(".bias")) and (new_key.startswith("backbone.posnet") or new_key.startswith("backbone.embed.bias")): + value = value.unsqueeze(1) + + if new_key.endswith("dwconv.bias"): + value = value.unsqueeze(1) + + size_mb = value.element_size() * value.nelement() / (1024 * 1024) + print(f"{size_mb:8.2f} MB - {new_key}: {value.shape}") + + size_total_mb += size_mb + + #print(key, '->', new_key, ': ', value) + #print(key, '->', new_key) + + items_new.append((new_key, value)) + + print(f"Total size: {size_total_mb:8.2f} MB") + + return dict(items_new) + +flattened_state_dict = flatten_state_dict(state_dict) + + +# Convert the model to the safetensors format +output_path = path_dst + '/model.safetensors' +save_file(flattened_state_dict, output_path) + +print(f"Model has been successfully converted and saved to {output_path}") + +# Calculate the total size of the .safetensors file +total_size = os.path.getsize(output_path) + +# Create the weight map +weight_map = { + "model.safetensors": ["*"] # Assuming all weights are in one file +} + +# Create metadata for the index.json file +metadata = { + "total_size": total_size, + "weight_map": weight_map +} + +# Save the metadata to index.json +index_path = path_dst + '/index.json' +with open(index_path, 'w') as f: + json.dump(metadata, f, indent=4) + +print(f"Metadata has been saved to {index_path}") + +config = { + "architectures": [ + "WavTokenizerDec" + ], + "hidden_size": 1282, + "n_embd_features": 512, + "n_ff": 2304, + "vocab_size": 4096, + "n_head": 1, + "layer_norm_epsilon": 1e-6, + "group_norm_epsilon": 1e-6, + "group_norm_groups": 32, + "max_position_embeddings": 8192, # ? + "n_layer": 12, + "posnet": { + "n_embd": 768, + "n_layer": 6 + }, + "convnext": { + "n_embd": 768, + "n_layer": 12 + }, +} + +with open(path_dst + '/config.json', 'w') as f: + json.dump(config, f, indent=4) + +print(f"Config has been saved to {path_dst + 'config.json'}") diff --git a/examples/tts/tts-outetts.py b/examples/tts/tts-outetts.py new file mode 100644 index 000000000..0f81192fc --- /dev/null +++ b/examples/tts/tts-outetts.py @@ -0,0 +1,175 @@ +import sys +#import json +#import struct +import requests +import re + +def process_text(text: str): + text = re.sub(r'\d+(\.\d+)?', lambda x: x.group(), text.lower()) # TODO this needs to be fixed + text = re.sub(r'[-_/,\.\\]', ' ', text) + text = re.sub(r'[^a-z\s]', '', text) + text = re.sub(r'\s+', ' ', text).strip() + return text.split() + +# usage: +# python tts-outetts.py http://server-llm:port http://server-dec:port "text" + +if len(sys.argv) <= 3: + print("usage: python tts-outetts.py http://server-llm:port http://server-dec:port \"text\"") + exit(1) + +host_llm = sys.argv[1] +host_dec = sys.argv[2] +text = sys.argv[3] + +prefix = """<|im_start|> +<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>""" + +words = process_text(text) +words = "<|text_sep|>".join([i.strip() for i in words]) +words += "<|text_end|>\n" + +# voice data +# TODO: load from json +#suffix = """<|audio_start|> +#the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +#overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +#package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +#from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +#just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +#two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +#people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> +#is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +#pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +#remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +#sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +#i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +#have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +#some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +#critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +#about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +#some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +#of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +#the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +#gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +#aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +#but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +#its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +#still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> +#really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +#enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +#and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +#it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +#looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +#lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>""" + +# TODO: tokenization is slow for some reason - here is pre-tokenized input +suffix = [ 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, 152460, 153375, 151670, 198, 74455, + 155808, 151669, 151799, 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, 151970, 153413, + 152419, 153334, 153289, 153374, 153199, 152040, 153260, 152721, 152680, 153297, 152419, 153248, 152400, + 152691, 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, 152256, 152991, 152299, 152688, 153163, + 153016, 152789, 153198, 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, 152461, 153321, + 153309, 151750, 152137, 153340, 152573, 152267, 153347, 151789, 152681, 153339, 151992, 152512, 151751, + 152179, 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, 152311, 151670, 198, 1499, 155791, + 151669, 152276, 152454, 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, 153043, 152325, + 153267, 152622, 151670, 198, 4250, 155797, 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, 152112, 153204, 151722, 152542, 151670, 198, + 19789, 155796, 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, 152191, 151734, 152312, 152810, + 152237, 153224, 153169, 153224, 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, 152265, 151946, + 151808, 152412, 152363, 152305, 153156, 152733, 152810, 153157, 152016, 152100, 152069, 153234, 152317, + 152589, 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, 153376, 152272, 152433, 152325, + 151941, 151670, 198, 285, 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, 152474, 152680, + 152157, 153255, 152324, 151682, 151670, 198, 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, 153070, 151883, 152890, 152489, 153144, + 153375, 152358, 151685, 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, 151902, 152720, + 153377, 152027, 152378, 152821, 153207, 153459, 153028, 153068, 152507, 153255, 152158, 152921, 151958, + 152609, 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, 152606, 152162, 152186, 153071, + 152244, 153118, 153375, 153018, 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, 153380, + 153502, 152702, 152115, 153181, 152735, 153277, 153457, 152393, 153112, 152595, 151670, 198, 19098, 155808, + 151669, 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, 153163, 152922, 153402, 152034, + 152591, 153438, 152215, 151673, 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, 152718, + 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, 152111, 152746, 152377, 153471, 152309, 151670, 198, + 19016, 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, 152939, 152536, 152091, 151815, 152733, + 151672, 151670, 198, 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, 153504, 152589, 153333, + 151839, 151941, 153038, 153180, 151670, 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, 152801, + 152985, 153400, 152393, 152818, 152765, 152249, 152600, 151699, 152302, 152752, 153018, 153009, 151992, + 153054, 152847, 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, 152048, 152757, 152428, + 153195, 151906, 153006, 153178, 153250, 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, 153321, 152217, 153039, 152935, 153400, 152122, + 152531, 153106, 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, 152901, 152885, 152594, + 153446, 153080, 151670, 198, 14689, 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, 151673, + 151690, 151698, 152714, 152846, 152981, 153171, 153384, 153364, 153188, 153246, 151670, 198, 1055, 155779, + 151669, 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, 155780, 151669, 153483, 153240, 152241, + 152558, 152697, 153046, 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, 153034, 153434, + 153372, 153347, 151887, 152453, 152758, 152133, 152510, 152694, 152431, 152321, 153088, 152676, 152223, + 152581, 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, 152903, 152859, 152989, 151748, + 152669, 152661, 152650, 152409, 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, 152988, + 152894, 151819, 152391, 153019, 152058, 153062, 153230, 151826, 152112, 152306, 152264, 152769, 153390, + 152384, 152435, 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, 151919, 151893, 152558, + 152817, 152946, 152956, 152129, 152715, 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, 152679, 152533, 152382, 152374, 152611, 153341, + 153163, 152285, 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, 151669, 151764, 152360, 153295, + 152634, 153342, 152199, 152271, 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, 152016, 152385, + 152629, 152495, 151826, 153321, 152958, 152180, 151886, 153432, 152922, 152128, 153024, 153040, 152593, + 152287, 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, 152680, 153331, 151699, 152316, 152938, + 152289, 152433, 153384, 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, 152489, 151941, + 152049, 152034, 153053, 152179, 153160, 151676, 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, 153135, 152291, 153235, 152143, 152583, + 152402, 153483, 152678, 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, 152548, 153442, + 152109, 152659, 153325, 152781, 152570, 152957, 151752, 152265, 153381, 152515, 151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, 151792, 153409, 153327, 152990, 151670, 198, + 275, 155781, 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, 151670, 198, 94273, 155799, + 151669, 152953, 152938, 153427, 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, 152257, + 152987, 152777, 153448, 152408, 151696, 152408, 152326, 152699, 151670, 198, 385, 16239, 155828, 151669, + 152306, 152268, 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, 152918, 152923, 152467, + 152331, 153053, 153330, 151889, 153444, 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, 152109, 152255, 151739, 152267, 152759, + 153318, 153165, 153349, 151670, ] + +response = requests.post( + host_llm + "/completion", + json={ + "prompt": [prefix + words, *suffix], + "n_predict": 1024, + "cache_prompt": True, + "return_tokens": True, + "samplers": ["top_k"], + "top_k": 16, + "seed": 1003, + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) +#print(json.dumps(response_json["prompt"], indent=4).replace("\\n", "\n")) +#print(json.dumps(response_json["timings"], indent=4)) +#print(json.dumps(response_json["tokens"], indent=4)) + +codes = response_json["tokens"] + +codes = [t - 151672 for t in codes if t >= 151672 and t <= 155772] + +response = requests.post( + host_dec + "/embeddings", + json={ + "input": [*codes], + } +) + +response_json = response.json() + +#print(json.dumps(response_json, indent=4)) + +# spectrogram +embd = response_json[0]["embedding"] + +n_codes = len(embd) +n_embd = len(embd[0]) + +print('spectrogram generated: n_codes: %d, n_embd: %d' % (n_codes, n_embd)) + +# post-process the spectrogram to convert to audio +# TODO: see the tts.cpp:embd_to_audio() and implement it in Python +print('converting to audio ...') +print('TODO: see the tts.cpp:embd_to_audio() and implement it in Python') diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp new file mode 100644 index 000000000..7f36b80f0 --- /dev/null +++ b/examples/tts/tts.cpp @@ -0,0 +1,932 @@ +#include "arg.h" +#include "common.h" +#include "sampling.h" +#include "log.h" +#include "llama.h" + +#define _USE_MATH_DEFINES // For M_PI on MSVC + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +// Terminal utils +// + +#define SQR(X) ((X) * (X)) +#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40 + +/** + * Quantizes 24-bit RGB to xterm256 code range [16,256). + */ +static int rgb2xterm256(int r, int g, int b) { + unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377}; + int av, ir, ig, ib, il, qr, qg, qb, ql; + av = r * .299 + g * .587 + b * .114 + .5; + ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8; + qr = cube[(ir = UNCUBE(r))]; + qg = cube[(ig = UNCUBE(g))]; + qb = cube[(ib = UNCUBE(b))]; + if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <= + SQR(ql - r) + SQR(ql - g) + SQR(ql - b)) + return ir * 36 + ig * 6 + ib + 020; + return il + 0350; +} + +static std::string set_xterm256_foreground(int r, int g, int b) { + int x = rgb2xterm256(r, g, b); + std::ostringstream oss; + oss << "\033[38;5;" << x << "m"; + return oss.str(); +} + +const std::vector k_colors = { + set_xterm256_foreground(220, 5, 12), + set_xterm256_foreground(232, 96, 28), + set_xterm256_foreground(241, 147, 45), + set_xterm256_foreground(246, 193, 65), + set_xterm256_foreground(247, 240, 86), + set_xterm256_foreground(144, 201, 135), + set_xterm256_foreground( 78, 178, 101), +}; + +static void print_usage(int, char ** argv) { + LOG("\nexample usage:\n"); + LOG("\n %s -m model.gguf -p \"Hello!\"\n", argv[0]); + LOG("\n"); +} + +struct wav_header { + char riff[4] = {'R', 'I', 'F', 'F'}; + uint32_t chunk_size; + char wave[4] = {'W', 'A', 'V', 'E'}; + char fmt[4] = {'f', 'm', 't', ' '}; + uint32_t fmt_chunk_size = 16; + uint16_t audio_format = 1; // PCM + uint16_t num_channels = 1; // Mono + uint32_t sample_rate; + uint32_t byte_rate; + uint16_t block_align; + uint16_t bits_per_sample = 16; + char data[4] = {'d', 'a', 't', 'a'}; + uint32_t data_size; +}; + +static void save_wav16(const std::string & fname, const std::vector & data, int sample_rate) { + std::ofstream file(fname, std::ios::binary); + if (!file) { + LOG_ERR("%s: Failed to open file '%s' for writing", __func__, fname.c_str()); + return; + } + + wav_header header; + header.sample_rate = sample_rate; + header.byte_rate = header.sample_rate * header.num_channels * (header.bits_per_sample / 8); + header.block_align = header.num_channels * (header.bits_per_sample / 8); + header.data_size = data.size() * (header.bits_per_sample / 8); + header.chunk_size = 36 + header.data_size; + + file.write(reinterpret_cast(&header), sizeof(header)); + + for (const auto & sample : data) { + int16_t pcm_sample = static_cast(std::clamp(sample * 32767.0, -32768.0, 32767.0)); + file.write(reinterpret_cast(&pcm_sample), sizeof(pcm_sample)); + } + + file.close(); +} + +static void fill_hann_window(int length, bool periodic, float * output) { + int offset = -1; + if (periodic) { + offset = 0; + } + for (int i = 0; i < length; i++) { + output[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + } +} + +// very poor-man fft +static void twiddle(float * real, float * imag, int k, int N) { + float angle = 2 * M_PI * k / N; + *real = cos(angle); + *imag = sin(angle); +} + +static void irfft(int n, const float * inp_cplx, float * out_real) { + int N = n / 2 + 1; + + std::vector real_input(N); + std::vector imag_input(N); + for (int i = 0; i < N; ++i) { + real_input[i] = inp_cplx[2 * i]; + imag_input[i] = inp_cplx[2 * i + 1]; + } + + std::vector real_output(n); + std::vector imag_output(n); + + for (int k = 0; k < n; ++k) { + real_output[k] = 0.0f; + imag_output[k] = 0.0f; + for (int m = 0; m < N; ++m) { + float twiddle_real; + float twiddle_imag; + + twiddle(&twiddle_real, &twiddle_imag, k * m, n); + + real_output[k] += real_input[m] * twiddle_real - imag_input[m] * twiddle_imag; + imag_output[k] += real_input[m] * twiddle_imag + imag_input[m] * twiddle_real; + } + } + + for (int i = 0; i < n; ++i) { + out_real[i] = real_output[i] / N; + } +} + +// +// y = torch.nn.functional.fold( +// data, output_size=(1, output_size), kernel_size=(1, self.win_length), stride=(1, self.hop_length), +// )[:, 0, 0, pad:-pad] +// +// data.shape = torch.Size([1, 1280, 261]) +// output_size = 84480 +// win_length = 1280 +// hop_length = 320 +// pad = 480 +// +static void fold(const std::vector & data, int64_t n_out, int64_t n_win, int64_t n_hop, int64_t n_pad, std::vector & output) { + int64_t output_height = n_out; + int64_t kernel_w = n_win; + int64_t stride_w = n_hop; + int64_t width = n_out; + + output.resize(width, 0.0f); + + int64_t col_idx = 0; + for (int64_t w_col = 0; w_col < width; ++w_col) { + int64_t start = w_col * stride_w - n_pad; + int64_t end = start + kernel_w; + + for (int64_t w_im = start; w_im < end; ++w_im) { + if (w_im >= 0 && w_im < output_height && col_idx < (int64_t) data.size()) { + output[w_im] += data[col_idx]; + } + col_idx++; + } + } + + output.resize(n_out - 2 * n_pad); +} + +// TODO: not optimized at all +static std::vector embd_to_audio( + const float * embd, + const int n_codes, + const int n_embd, + const int n_thread) { + const int n_fft = 1280; + const int n_hop = 320; + const int n_win = 1280; + const int n_pad = (n_win - n_hop)/2; + const int n_out = (n_codes - 1)*n_hop + n_win; + + std::vector hann(n_fft); + + fill_hann_window(hann.size(), true, hann.data()); + + int n_spec = n_embd*n_codes; + + std::vector E (n_spec); + std::vector S (n_spec); + std::vector ST(n_spec); + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd; ++k) { + E[k*n_codes + l] = embd[l*n_embd + k]; + } + } + + for (int k = 0; k < n_embd/2; ++k) { + for (int l = 0; l < n_codes; ++l) { + float mag = E[(k )*n_codes + l]; + float phi = E[(k + n_embd/2)*n_codes + l]; + + mag = exp(mag); + + if (mag > 1e2) { + mag = 1e2; + } + S[2*(k*n_codes + l) + 0] = mag*cosf(phi); + S[2*(k*n_codes + l) + 1] = mag*sinf(phi); + } + } + + for (int l = 0; l < n_codes; ++l) { + for (int k = 0; k < n_embd/2; ++k) { + ST[l*n_embd + 2*k + 0] = S[2*(k*n_codes + l) + 0]; + ST[l*n_embd + 2*k + 1] = S[2*(k*n_codes + l) + 1]; + } + } + + std::vector res (n_codes*n_fft); + std::vector hann2(n_codes*n_fft); + + std::vector workers(n_thread); + for (int i = 0; i < n_thread; ++i) { + workers[i] = std::thread([&, i]() { + for (int l = i; l < n_codes; l += n_thread) { + irfft(n_fft, ST.data() + l*n_embd, res.data() + l*n_fft); + for (int j = 0; j < n_fft; ++j) { + res [l*n_fft + j] *= hann[j]; + hann2[l*n_fft + j] = hann[j] * hann[j]; + } + } + }); + } + for (int i = 0; i < n_thread; ++i) { + workers[i].join(); + } + + std::vector audio; + std::vector env; + + fold(res, n_out, n_win, n_hop, n_pad, audio); + fold(hann2, n_out, n_win, n_hop, n_pad, env); // TODO: can be done once + + for (size_t i = 0; i < audio.size(); ++i) { + audio[i] /= env[i]; + } + + return audio; +} + +static const std::map ones = { + {0, "zero"}, {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"}, + {5, "five"}, {6, "six"}, {7, "seven"}, {8, "eight"}, {9, "nine"}, + {10, "ten"}, {11, "eleven"}, {12, "twelve"}, {13, "thirteen"}, {14, "fourteen"}, + {15, "fifteen"}, {16, "sixteen"}, {17, "seventeen"}, {18, "eighteen"}, {19, "nineteen"} +}; + +static const std::map tens = { + {2, "twenty"}, {3, "thirty"}, {4, "forty"}, {5, "fifty"}, + {6, "sixty"}, {7, "seventy"}, {8, "eighty"}, {9, "ninety"} +}; + +// Convert a number less than 1000 to words +static std::string convert_less_than_thousand(int num) { + std::string result; + + if (num >= 100) { + result += ones.at(num / 100) + " hundred "; + num %= 100; + } + + if (num >= 20) { + result += tens.at(num / 10); + if (num % 10 > 0) { + result += "-" + ones.at(num % 10); + } + } else if (num > 0) { + result += ones.at(num); + } + + return result; +} + +static std::string number_to_words(const std::string & number_str) { + try { + size_t decimal_pos = number_str.find('.'); + std::string integer_part = number_str.substr(0, decimal_pos); + + int int_number = std::stoi(integer_part); + std::string result; + + if (int_number == 0) { + result = "zero"; + } else { + if (int_number >= 1000000000) { + int billions = int_number / 1000000000; + result += convert_less_than_thousand(billions) + " billion "; + int_number %= 1000000000; + } + + if (int_number >= 1000000) { + int millions = int_number / 1000000; + result += convert_less_than_thousand(millions) + " million "; + int_number %= 1000000; + } + + if (int_number >= 1000) { + int thousands = int_number / 1000; + result += convert_less_than_thousand(thousands) + " thousand "; + int_number %= 1000; + } + + if (int_number > 0) { + result += convert_less_than_thousand(int_number); + } + } + + // Handle decimal part + if (decimal_pos != std::string::npos) { + result += " point"; + std::string decimal_part = number_str.substr(decimal_pos + 1); + for (char digit : decimal_part) { + result += " " + ones.at(digit - '0'); + } + } + + return result; + } catch (const std::exception& e) { + // Skip if fails + return " "; + } +} + +static std::string replace_numbers_with_words(const std::string & input_text) { + std::regex number_pattern(R"(\d+(\.\d+)?)"); + std::string result; + auto it = std::sregex_iterator(input_text.begin(), input_text.end(), number_pattern); + auto end = std::sregex_iterator(); + + size_t last_pos = 0; + for (std::sregex_iterator i = it; i != end; ++i) { + const std::smatch& match = *i; + result.append(input_text, last_pos, match.position() - last_pos); + result.append(number_to_words(match.str())); + last_pos = match.position() + match.length(); + } + result.append(input_text, last_pos); + + return result; +} + +// Based on: https://github.com/edwko/OuteTTS/blob/a613e79c489d8256dd657ea9168d78de75895d82/outetts/version/v1/prompt_processor.py#L39 +static std::string process_text(const std::string & text) { + + // For now I skipped text romanization as I am unsure how to handle + // uroman and MeCab implementations in C++ + // maybe something like https://github.com/anyascii/anyascii/ could work. + // currently only English would be supported in this function + + std::string processed_text = replace_numbers_with_words(text); + + std::transform(processed_text.begin(), processed_text.end(), + processed_text.begin(), ::tolower); + + std::regex special_chars(R"([-_/,\.\\])"); + processed_text = std::regex_replace(processed_text, special_chars, " "); + + std::regex non_alpha(R"([^a-z\s])"); + processed_text = std::regex_replace(processed_text, non_alpha, ""); + + std::regex multiple_spaces(R"(\s+)"); + processed_text = std::regex_replace(processed_text, multiple_spaces, " "); + + processed_text = std::regex_replace(processed_text, std::regex(R"(^\s+|\s+$)"), ""); + + /* + Replace spaces with the separator token same as in line 365 + + for (auto & c : prompt_user) { + if (c == ' ') { + prompt_clean += "<|text_sep|>"; + */ + processed_text = std::regex_replace(processed_text, std::regex(R"(\s)"), "<|text_sep|>"); + + return processed_text; +} + +static void prompt_add(llama_tokens & prompt, llama_token token) { + prompt.push_back(token); +} + +static void prompt_add(llama_tokens & prompt, const llama_tokens & tokens) { + prompt.insert(prompt.end(), tokens.begin(), tokens.end()); +} + +static void prompt_add(llama_tokens & prompt, const llama_model * model, const std::string & txt, bool add_special, bool parse_special) { + auto tmp = common_tokenize(model, txt, add_special, parse_special); + prompt_add(prompt, tmp); +} + +static void prompt_init(llama_tokens & prompt, const llama_model * model) { + prompt.clear(); + + prompt_add(prompt, model, "<|im_start|>\n", true, true); +} + +int main(int argc, char ** argv) { + common_params params; + + params.prompt = ""; + + params.n_predict = 4096; + params.n_batch = 8192; + params.n_ctx = 8192; + + params.sampling.top_k = 4; + params.sampling.samplers = { COMMON_SAMPLER_TYPE_TOP_K, }; + + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_TTS, print_usage)) { + return 1; + } + + const int n_parallel = params.n_parallel; + const int n_predict = params.n_predict; + + common_init(); + + // init LLM + + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model_ttc = NULL; // text-to-codes + llama_model * model_cts = NULL; // codes-to-speech + + llama_context * ctx_ttc = NULL; + llama_context * ctx_cts = NULL; + + common_init_result llama_init_ttc = common_init_from_params(params); + model_ttc = llama_init_ttc.model; + ctx_ttc = llama_init_ttc.context; + + // TODO: refactor in a common struct + params.model = params.vocoder.model; + params.model_url = params.vocoder.model_url; + params.hf_repo = params.vocoder.hf_repo; + params.hf_file = params.vocoder.hf_file; + + params.embedding = true; + + common_init_result llama_init_cts = common_init_from_params(params); + model_cts = llama_init_cts.model; + ctx_cts = llama_init_cts.context; + + std::vector smpl(n_parallel); + for (int i = 0; i < n_parallel; ++i) { + params.sampling.no_perf = (i != 0); + params.sampling.seed = params.sampling.seed + 1; + + smpl[i] = common_sampler_init(model_ttc, params.sampling); + } + + LOG_INF("sampler seed: %u\n", common_sampler_get_seed(smpl[0])); + LOG_INF("sampler params: \n%s\n", params.sampling.print().c_str()); + LOG_INF("sampler chain: %s\n", common_sampler_print(smpl[0]).c_str()); + + LOG_INF("%s: loading done\n", __func__); + + const auto t_main_start = ggml_time_us(); + + std::vector codes; + + // process prompt and generate voice codes + { + LOG_INF("%s: constructing prompt ..\n", __func__); + + std::vector prompt_inp; + + prompt_init(prompt_inp, model_ttc); + + prompt_add(prompt_inp, model_ttc, "<|text_start|>the<|text_sep|>overall<|text_sep|>package<|text_sep|>from<|text_sep|>just<|text_sep|>two<|text_sep|>people<|text_sep|>is<|text_sep|>pretty<|text_sep|>remarkable<|text_sep|>sure<|text_sep|>i<|text_sep|>have<|text_sep|>some<|text_sep|>critiques<|text_sep|>about<|text_sep|>some<|text_sep|>of<|text_sep|>the<|text_sep|>gameplay<|text_sep|>aspects<|text_sep|>but<|text_sep|>its<|text_sep|>still<|text_sep|>really<|text_sep|>enjoyable<|text_sep|>and<|text_sep|>it<|text_sep|>looks<|text_sep|>lovely<|text_sep|>", false, true); + + // convert the input text into the necessary format expected by OuteTTS + { + std::string prompt_clean = process_text(params.prompt); + + LOG_INF("%s: prompt: '%s'\n", __func__, prompt_clean.c_str()); + + prompt_add(prompt_inp, model_ttc, prompt_clean, false, true); + } + + prompt_add(prompt_inp, model_ttc, "<|text_end|>\n", false, true); + + // disabled to save time on tokenizing each time + // TODO: load voices from the json files +#if 0 + const std::string voice_data = R"(<|audio_start|> +the<|t_0.08|><|code_start|><|257|><|740|><|636|><|913|><|788|><|1703|><|code_end|> +overall<|t_0.36|><|code_start|><|127|><|201|><|191|><|774|><|700|><|532|><|1056|><|557|><|798|><|298|><|1741|><|747|><|1662|><|1617|><|1702|><|1527|><|368|><|1588|><|1049|><|1008|><|1625|><|747|><|1576|><|728|><|1019|><|1696|><|1765|><|code_end|> +package<|t_0.56|><|code_start|><|935|><|584|><|1319|><|627|><|1016|><|1491|><|1344|><|1117|><|1526|><|1040|><|239|><|1435|><|951|><|498|><|723|><|1180|><|535|><|789|><|1649|><|1637|><|78|><|465|><|1668|><|901|><|595|><|1675|><|117|><|1009|><|1667|><|320|><|840|><|79|><|507|><|1762|><|1508|><|1228|><|1768|><|802|><|1450|><|1457|><|232|><|639|><|code_end|> +from<|t_0.19|><|code_start|><|604|><|782|><|1682|><|872|><|1532|><|1600|><|1036|><|1761|><|647|><|1554|><|1371|><|653|><|1595|><|950|><|code_end|> +just<|t_0.25|><|code_start|><|1782|><|1670|><|317|><|786|><|1748|><|631|><|599|><|1155|><|1364|><|1524|><|36|><|1591|><|889|><|1535|><|541|><|440|><|1532|><|50|><|870|><|code_end|> +two<|t_0.24|><|code_start|><|1681|><|1510|><|673|><|799|><|805|><|1342|><|330|><|519|><|62|><|640|><|1138|><|565|><|1552|><|1497|><|1552|><|572|><|1715|><|1732|><|code_end|> +people<|t_0.39|><|code_start|><|593|><|274|><|136|><|740|><|691|><|633|><|1484|><|1061|><|1138|><|1485|><|344|><|428|><|397|><|1562|><|645|><|917|><|1035|><|1449|><|1669|><|487|><|442|><|1484|><|1329|><|1832|><|1704|><|600|><|761|><|653|><|269|><|code_end|> +is<|t_0.16|><|code_start|><|566|><|583|><|1755|><|646|><|1337|><|709|><|802|><|1008|><|485|><|1583|><|652|><|10|><|code_end|> +pretty<|t_0.32|><|code_start|><|1818|><|1747|><|692|><|733|><|1010|><|534|><|406|><|1697|><|1053|><|1521|><|1355|><|1274|><|816|><|1398|><|211|><|1218|><|817|><|1472|><|1703|><|686|><|13|><|822|><|445|><|1068|><|code_end|> +remarkable<|t_0.68|><|code_start|><|230|><|1048|><|1705|><|355|><|706|><|1149|><|1535|><|1787|><|1356|><|1396|><|835|><|1583|><|486|><|1249|><|286|><|937|><|1076|><|1150|><|614|><|42|><|1058|><|705|><|681|><|798|><|934|><|490|><|514|><|1399|><|572|><|1446|><|1703|><|1346|><|1040|><|1426|><|1304|><|664|><|171|><|1530|><|625|><|64|><|1708|><|1830|><|1030|><|443|><|1509|><|1063|><|1605|><|1785|><|721|><|1440|><|923|><|code_end|> +sure<|t_0.36|><|code_start|><|792|><|1780|><|923|><|1640|><|265|><|261|><|1525|><|567|><|1491|><|1250|><|1730|><|362|><|919|><|1766|><|543|><|1|><|333|><|113|><|970|><|252|><|1606|><|133|><|302|><|1810|><|1046|><|1190|><|1675|><|code_end|> +i<|t_0.08|><|code_start|><|123|><|439|><|1074|><|705|><|1799|><|637|><|code_end|> +have<|t_0.16|><|code_start|><|1509|><|599|><|518|><|1170|><|552|><|1029|><|1267|><|864|><|419|><|143|><|1061|><|0|><|code_end|> +some<|t_0.16|><|code_start|><|619|><|400|><|1270|><|62|><|1370|><|1832|><|917|><|1661|><|167|><|269|><|1366|><|1508|><|code_end|> +critiques<|t_0.60|><|code_start|><|559|><|584|><|1163|><|1129|><|1313|><|1728|><|721|><|1146|><|1093|><|577|><|928|><|27|><|630|><|1080|><|1346|><|1337|><|320|><|1382|><|1175|><|1682|><|1556|><|990|><|1683|><|860|><|1721|><|110|><|786|><|376|><|1085|><|756|><|1523|><|234|><|1334|><|1506|><|1578|><|659|><|612|><|1108|><|1466|><|1647|><|308|><|1470|><|746|><|556|><|1061|><|code_end|> +about<|t_0.29|><|code_start|><|26|><|1649|><|545|><|1367|><|1263|><|1728|><|450|><|859|><|1434|><|497|><|1220|><|1285|><|179|><|755|><|1154|><|779|><|179|><|1229|><|1213|><|922|><|1774|><|1408|><|code_end|> +some<|t_0.23|><|code_start|><|986|><|28|><|1649|><|778|><|858|><|1519|><|1|><|18|><|26|><|1042|><|1174|><|1309|><|1499|><|1712|><|1692|><|1516|><|1574|><|code_end|> +of<|t_0.07|><|code_start|><|197|><|716|><|1039|><|1662|><|64|><|code_end|> +the<|t_0.08|><|code_start|><|1811|><|1568|><|569|><|886|><|1025|><|1374|><|code_end|> +gameplay<|t_0.48|><|code_start|><|1269|><|1092|><|933|><|1362|><|1762|><|1700|><|1675|><|215|><|781|><|1086|><|461|><|838|><|1022|><|759|><|649|><|1416|><|1004|><|551|><|909|><|787|><|343|><|830|><|1391|><|1040|><|1622|><|1779|><|1360|><|1231|><|1187|><|1317|><|76|><|997|><|989|><|978|><|737|><|189|><|code_end|> +aspects<|t_0.56|><|code_start|><|1423|><|797|><|1316|><|1222|><|147|><|719|><|1347|><|386|><|1390|><|1558|><|154|><|440|><|634|><|592|><|1097|><|1718|><|712|><|763|><|1118|><|1721|><|1311|><|868|><|580|><|362|><|1435|><|868|><|247|><|221|><|886|><|1145|><|1274|><|1284|><|457|><|1043|><|1459|><|1818|><|62|><|599|><|1035|><|62|><|1649|><|778|><|code_end|> +but<|t_0.20|><|code_start|><|780|><|1825|><|1681|><|1007|><|861|><|710|><|702|><|939|><|1669|><|1491|><|613|><|1739|><|823|><|1469|><|648|><|code_end|> +its<|t_0.09|><|code_start|><|92|><|688|><|1623|><|962|><|1670|><|527|><|599|><|code_end|> +still<|t_0.27|><|code_start|><|636|><|10|><|1217|><|344|><|713|><|957|><|823|><|154|><|1649|><|1286|><|508|><|214|><|1760|><|1250|><|456|><|1352|><|1368|><|921|><|615|><|5|><|code_end|> +really<|t_0.36|><|code_start|><|55|><|420|><|1008|><|1659|><|27|><|644|><|1266|><|617|><|761|><|1712|><|109|><|1465|><|1587|><|503|><|1541|><|619|><|197|><|1019|><|817|><|269|><|377|><|362|><|1381|><|507|><|1488|><|4|><|1695|><|code_end|> +enjoyable<|t_0.49|><|code_start|><|678|><|501|><|864|><|319|><|288|><|1472|><|1341|><|686|><|562|><|1463|><|619|><|1563|><|471|><|911|><|730|><|1811|><|1006|><|520|><|861|><|1274|><|125|><|1431|><|638|><|621|><|153|><|876|><|1770|><|437|><|987|><|1653|><|1109|><|898|><|1285|><|80|><|593|><|1709|><|843|><|code_end|> +and<|t_0.15|><|code_start|><|1285|><|987|><|303|><|1037|><|730|><|1164|><|502|><|120|><|1737|><|1655|><|1318|><|code_end|> +it<|t_0.09|><|code_start|><|848|><|1366|><|395|><|1601|><|1513|><|593|><|1302|><|code_end|> +looks<|t_0.27|><|code_start|><|1281|><|1266|><|1755|><|572|><|248|><|1751|><|1257|><|695|><|1380|><|457|><|659|><|585|><|1315|><|1105|><|1776|><|736|><|24|><|736|><|654|><|1027|><|code_end|> +lovely<|t_0.56|><|code_start|><|634|><|596|><|1766|><|1556|><|1306|><|1285|><|1481|><|1721|><|1123|><|438|><|1246|><|1251|><|795|><|659|><|1381|><|1658|><|217|><|1772|><|562|><|952|><|107|><|1129|><|1112|><|467|><|550|><|1079|><|840|><|1615|><|1469|><|1380|><|168|><|917|><|836|><|1827|><|437|><|583|><|67|><|595|><|1087|><|1646|><|1493|><|1677|><|code_end|>)"; + + auto tmp = common_tokenize(model_ttc, voice_data, false, true); + printf("\n\n"); + for (int i = 0; i < tmp.size(); ++i) { + printf("%d, ", tmp[i]); + } + printf("\n\n"); +#else + prompt_add(prompt_inp, llama_tokens { + 151667, 198, 1782, 155780, 151669, 151929, 152412, 152308, 152585, + 152460, 153375, 151670, 198, 74455, 155808, 151669, 151799, + 151873, 151863, 152446, 152372, 152204, 152728, 152229, 152470, + 151970, 153413, 152419, 153334, 153289, 153374, 153199, 152040, + 153260, 152721, 152680, 153297, 152419, 153248, 152400, 152691, + 153368, 153437, 151670, 198, 1722, 155828, 151669, 152607, + 152256, 152991, 152299, 152688, 153163, 153016, 152789, 153198, + 152712, 151911, 153107, 152623, 152170, 152395, 152852, 152207, + 152461, 153321, 153309, 151750, 152137, 153340, 152573, 152267, + 153347, 151789, 152681, 153339, 151992, 152512, 151751, 152179, + 153434, 153180, 152900, 153440, 152474, 153122, 153129, 151904, + 152311, 151670, 198, 1499, 155791, 151669, 152276, 152454, + 153354, 152544, 153204, 153272, 152708, 153433, 152319, 153226, + 153043, 152325, 153267, 152622, 151670, 198, 4250, 155797, + 151669, 153454, 153342, 151989, 152458, 153420, 152303, 152271, + 152827, 153036, 153196, 151708, 153263, 152561, 153207, 152213, + 152112, 153204, 151722, 152542, 151670, 198, 19789, 155796, + 151669, 153353, 153182, 152345, 152471, 152477, 153014, 152002, + 152191, 151734, 152312, 152810, 152237, 153224, 153169, 153224, + 152244, 153387, 153404, 151670, 198, 16069, 155811, 151669, + 152265, 151946, 151808, 152412, 152363, 152305, 153156, 152733, + 152810, 153157, 152016, 152100, 152069, 153234, 152317, 152589, + 152707, 153121, 153341, 152159, 152114, 153156, 153001, 153504, + 153376, 152272, 152433, 152325, 151941, 151670, 198, 285, + 155788, 151669, 152238, 152255, 153427, 152318, 153009, 152381, + 152474, 152680, 152157, 153255, 152324, 151682, 151670, 198, + 32955, 155804, 151669, 153490, 153419, 152364, 152405, 152682, + 152206, 152078, 153369, 152725, 153193, 153027, 152946, 152488, + 153070, 151883, 152890, 152489, 153144, 153375, 152358, 151685, + 152494, 152117, 152740, 151670, 198, 37448, 480, 155840, 151669, + 151902, 152720, 153377, 152027, 152378, 152821, 153207, 153459, + 153028, 153068, 152507, 153255, 152158, 152921, 151958, 152609, + 152748, 152822, 152286, 151714, 152730, 152377, 152353, 152470, + 152606, 152162, 152186, 153071, 152244, 153118, 153375, 153018, + 152712, 153098, 152976, 152336, 151843, 153202, 152297, 151736, + 153380, 153502, 152702, 152115, 153181, 152735, 153277, 153457, + 152393, 153112, 152595, 151670, 198, 19098, 155808, 151669, + 152464, 153452, 152595, 153312, 151937, 151933, 153197, 152239, + 153163, 152922, 153402, 152034, 152591, 153438, 152215, 151673, + 152005, 151785, 152642, 151924, 153278, 151805, 151974, 153482, + 152718, 152862, 153347, 151670, 198, 72, 155780, 151669, 151795, + 152111, 152746, 152377, 153471, 152309, 151670, 198, 19016, + 155788, 151669, 153181, 152271, 152190, 152842, 152224, 152701, + 152939, 152536, 152091, 151815, 152733, 151672, 151670, 198, + 14689, 155788, 151669, 152291, 152072, 152942, 151734, 153042, + 153504, 152589, 153333, 151839, 151941, 153038, 153180, 151670, + 198, 36996, 8303, 155832, 151669, 152231, 152256, 152835, + 152801, 152985, 153400, 152393, 152818, 152765, 152249, 152600, + 151699, 152302, 152752, 153018, 153009, 151992, 153054, 152847, + 153354, 153228, 152662, 153355, 152532, 153393, 151782, 152458, + 152048, 152757, 152428, 153195, 151906, 153006, 153178, 153250, + 152331, 152284, 152780, 153138, 153319, 151980, 153142, 152418, + 152228, 152733, 151670, 198, 9096, 155801, 151669, 151698, + 153321, 152217, 153039, 152935, 153400, 152122, 152531, 153106, + 152169, 152892, 152957, 151851, 152427, 152826, 152451, 151851, + 152901, 152885, 152594, 153446, 153080, 151670, 198, 14689, + 155795, 151669, 152658, 151700, 153321, 152450, 152530, 153191, + 151673, 151690, 151698, 152714, 152846, 152981, 153171, 153384, + 153364, 153188, 153246, 151670, 198, 1055, 155779, 151669, + 151869, 152388, 152711, 153334, 151736, 151670, 198, 1782, + 155780, 151669, 153483, 153240, 152241, 152558, 152697, 153046, + 151670, 198, 5804, 1363, 155820, 151669, 152941, 152764, 152605, + 153034, 153434, 153372, 153347, 151887, 152453, 152758, 152133, + 152510, 152694, 152431, 152321, 153088, 152676, 152223, 152581, + 152459, 152015, 152502, 153063, 152712, 153294, 153451, 153032, + 152903, 152859, 152989, 151748, 152669, 152661, 152650, 152409, + 151861, 151670, 198, 300, 7973, 155828, 151669, 153095, 152469, + 152988, 152894, 151819, 152391, 153019, 152058, 153062, 153230, + 151826, 152112, 152306, 152264, 152769, 153390, 152384, 152435, + 152790, 153393, 152983, 152540, 152252, 152034, 153107, 152540, + 151919, 151893, 152558, 152817, 152946, 152956, 152129, 152715, + 153131, 153490, 151734, 152271, 152707, 151734, 153321, 152450, + 151670, 198, 8088, 155792, 151669, 152452, 153497, 153353, + 152679, 152533, 152382, 152374, 152611, 153341, 153163, 152285, + 153411, 152495, 153141, 152320, 151670, 198, 1199, 155781, + 151669, 151764, 152360, 153295, 152634, 153342, 152199, 152271, + 151670, 198, 43366, 155799, 151669, 152308, 151682, 152889, + 152016, 152385, 152629, 152495, 151826, 153321, 152958, 152180, + 151886, 153432, 152922, 152128, 153024, 153040, 152593, 152287, + 151677, 151670, 198, 53660, 155808, 151669, 151727, 152092, + 152680, 153331, 151699, 152316, 152938, 152289, 152433, 153384, + 151781, 153137, 153259, 152175, 153213, 152291, 151869, 152691, + 152489, 151941, 152049, 152034, 153053, 152179, 153160, 151676, + 153367, 151670, 198, 268, 4123, 480, 155821, 151669, 152350, + 152173, 152536, 151991, 151960, 153144, 153013, 152358, 152234, + 153135, 152291, 153235, 152143, 152583, 152402, 153483, 152678, + 152192, 152533, 152946, 151797, 153103, 152310, 152293, 151825, + 152548, 153442, 152109, 152659, 153325, 152781, 152570, 152957, + 151752, 152265, 153381, 152515, 151670, 198, 437, 155787, + 151669, 152957, 152659, 151975, 152709, 152402, 152836, 152174, + 151792, 153409, 153327, 152990, 151670, 198, 275, 155781, + 151669, 152520, 153038, 152067, 153273, 153185, 152265, 152974, + 151670, 198, 94273, 155799, 151669, 152953, 152938, 153427, + 152244, 151920, 153423, 152929, 152367, 153052, 152129, 152331, + 152257, 152987, 152777, 153448, 152408, 151696, 152408, 152326, + 152699, 151670, 198, 385, 16239, 155828, 151669, 152306, 152268, + 153438, 153228, 152978, 152957, 153153, 153393, 152795, 152110, + 152918, 152923, 152467, 152331, 153053, 153330, 151889, 153444, + 152234, 152624, 151779, 152801, 152784, 152139, 152222, 152751, + 152512, 153287, 153141, 153052, 151840, 152589, 152508, 153499, + 152109, 152255, 151739, 152267, 152759, 153318, 153165, 153349, + 151670,}); +#endif + + // print the prompt token-by-token + + LOG("\n"); + + for (auto id : prompt_inp) { + LOG("%s", common_token_to_piece(ctx_ttc, id).c_str()); + } + + LOG_INF("%s: prompt size: %d\n", __func__, (int) prompt_inp.size()); + + LOG("\n"); + + // create a llama_batch + // we use this object to submit token data for decoding + llama_batch batch = llama_batch_init(std::max(prompt_inp.size(), (size_t) n_parallel), 0, n_parallel); + + std::vector seq_ids(n_parallel, 0); + for (int32_t i = 0; i < n_parallel; ++i) { + seq_ids[i] = i; + } + + // evaluate the initial prompt + for (size_t i = 0; i < prompt_inp.size(); ++i) { + common_batch_add(batch, prompt_inp[i], i, seq_ids, false); + } + GGML_ASSERT(batch.n_tokens == (int) prompt_inp.size()); + + // llama_decode will output logits only for the last token of the prompt + batch.logits[batch.n_tokens - 1] = true; + + if (llama_decode(ctx_ttc, batch) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + if (n_parallel > 1) { + LOG_INF("\n\n%s: generating %d sequences ...\n", __func__, n_parallel); + } + + llama_synchronize(ctx_ttc); + + LOG_INF("%s: time for prompt: %.3f ms\n\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + + const auto t_dec_start = ggml_time_us(); + + // main loop + + // remember the batch index of the last token for each parallel sequence + // we need this to determine which logits to sample from + std::vector i_batch(n_parallel, batch.n_tokens - 1); + + int n_past = batch.n_tokens; + int n_decode = 0; + + while (n_decode <= n_predict) { + // prepare the next batch + common_batch_clear(batch); + + // sample the next token for each parallel sequence / stream + for (int32_t i = 0; i < n_parallel; ++i) { + if (i_batch[i] < 0) { + // the stream has already finished + continue; + } + + const llama_token new_token_id = common_sampler_sample(smpl[i], ctx_ttc, i_batch[i]); + + common_sampler_accept(smpl[i], new_token_id, true); + + codes.push_back(new_token_id); + + const auto * cands = common_sampler_get_candidates(smpl[i]); + + // is it an end of generation? -> mark the stream as finished + if (llama_token_is_eog(model_ttc, new_token_id) || n_decode == n_predict) { + std::string reason; + if (llama_token_is_eog(model_ttc, new_token_id)) { + reason = "eos"; + } else { + reason = "n_predict"; + } + + i_batch[i] = -1; + + LOG("\n"); + if (n_parallel > 1) { + LOG_CNT("\n"); + LOG_INF("%s: stream %d finished at n_past = %d, reason = '%s'\n", __func__, i, n_past, reason.c_str()); + } + + continue; + } + + { + const float p = cands->data[cands->selected].p; + + const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) ((3*p)*float(k_colors.size())))); + + LOG_CNT("%s%d%s", k_colors[col].c_str(), i, "\033[0m"); + //LOG_CNT("%d", i); + } + + i_batch[i] = batch.n_tokens; + + // push this new token for next evaluation + common_batch_add(batch, new_token_id, n_past, { i }, true); + } + + // all streams are finished + if (batch.n_tokens == 0) { + break; + } + + n_decode += 1; + n_past += 1; + + // evaluate the current batch with the transformer model + if (llama_decode(ctx_ttc, batch)) { + LOG_ERR("%s : failed to eval, return code %d\n", __func__, 1); + return 1; + } + } + + llama_batch_free(batch); + + LOG("\n"); + LOG_INF("%s: time for decoder: %.3f ms\n", __func__, (ggml_time_us() - t_dec_start) / 1000.0f); + } + + common_perf_print(ctx_ttc, smpl[0]); + + //std::vector codes = {198, 88225, 155856, 151669, 152205, + // 153064, 152537, 153421, 153209, 152524, 151689, 152993, 152438, 152695, + // 153091, 152945, 152829, 152534, 152934, 153020, 151997, 152263, 153010, + // 153146, 152399, 153208, 152496, 151793, 152848, 152263, 152571, 153286, + // 152227, 153300, 152934, 152263, 153208, 152263, 152965, 152430, 152296, + // 153146, 152920, 152376, 152556, 153363, 151775, 152044, 152972, 152690, + // 153379, 152368, 152233, 153422, 152490, 151996, 152022, 151694, 152061, + // 153238, 152539, 153356, 152640, 153021, 153123, 151962, 153094, 151670, + // 198, 20339, 13189, 155824, 151669, 152070, 152007, 152910, 151683, + // 152000, 152373, 152760, 152046, 151735, 152334, 152394, 153073, 152908, + // 151856, 151953, 153247, 153293, 151903, 153480, 153168, 152478, 153359, + // 153429, 151905, 151678, 152567, 152411, 152165, 152556, 153075, 153424, + // 151993, 152999, 153078, 152151, 152088, 153389, 152484, 151874, 151670, + // 198, 285, 155784, 151669, 152226, 152126, 152638, 153215, 151729, + // 152959, 153479, 153059, 151838, 151670, 198, 1782, 155783, 151669, + // 153288, 153055, 153314, 152497, 152962, 152741, 152076, 153253, 151670, + // 198, 471, 16488, 155825, 151669, 152060, 152916, 151893, 153469, 152501, + // 152080, 152743, 151932, 153161, 152096, 152761, 152698, 153401, 153242, + // 153336, 152441, 152838, 153467, 152706, 153496, 153310, 152422, 153360, + // 153115, 152763, 151998, 152373, 153450, 152554, 151968, 153323, 152055, + // 152468, 153111, 153358, 152813, 152010, 151770, 152823, 152960, 151670, + // 198, 22627, 155823, 151669, 152814, 152366, 153484, 152931, 153441, + // 152164, 152877, 152915, 153463, 151692, 152911, 152747, 152776, 151831, + // 153449, 151882, 152975, 152031, 152513, 153150, 152448, 152667, 153133, + // 153189, 152619, 153466, 152054, 152106, 153119, 152277, 152439, 153109, + // 152997, 152141, 153154, 153256, 153311, 151922, 151670, 198, 1055, + // 155781, 151669, 152633, 151850, 153060, 153270, 152560, 153348, 152729, + // 151670, 198, 25312, 155803, 151669, 152521, 153403, 152561, 153337, + // 153383, 152199, 153493, 153326, 151830, 152254, 152248, 152349, 152153, + // 153007, 151823, 153037, 152575, 152457, 152406, 152592, 153116, 153365, + // 153456, 151670, 198, 88225, 155817, 151669, 153271, 151925, 152218, + // 152418, 152253, 153140, 151903, 153151, 152626, 152338, 152647, 153464, + // 152785, 152768, 151711, 152037, 152033, 151804, 152216, 151701, 151855, + // 152348, 152995, 152955, 152905, 152342, 152340, 153391, 153453, 152418, + // 153415, 151990, 153083, 152884, 151670, 198, 151668, 198, 151645}; + + { + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); + + LOG("\n"); + LOG_INF("codes: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes size: %d\n", __func__, (int) codes.size()); + } + + // remove all non-audio tokens (i.e. < 151672 || > 155772) + codes.erase(std::remove_if(codes.begin(), codes.end(), [](llama_token t) { return t < 151672 || t > 155772; }), codes.end()); + + { + const std::string inp_txt = common_detokenize(ctx_ttc, codes, true); + LOG_INF("codes audio: '%s'\n", inp_txt.c_str()); + LOG_INF("%s: codes audio size: %d\n", __func__, (int) codes.size()); + } + + for (auto & token : codes) { + token -= 151672; + } + + const auto t_voc_start = ggml_time_us(); + + const int n_codes = codes.size(); + + llama_batch batch = llama_batch_init(n_codes, 0, 1); + + for (size_t i = 0; i < codes.size(); ++i) { + common_batch_add(batch, codes[i], i, { 0 }, true); // TODO: all logits? + } + GGML_ASSERT(batch.n_tokens == n_codes); + + if (llama_decode(ctx_cts, batch) != 0) { + LOG_ERR("%s: llama_decode() failed\n", __func__); + return 1; + } + + llama_synchronize(ctx_cts); + + LOG_INF("%s: time for vocoder: %.3f ms\n", __func__, (ggml_time_us() - t_voc_start) / 1000.0f); + + const auto t_spec_start = ggml_time_us(); + +#if 1 + // spectral operations + const int n_embd = llama_n_embd(model_cts); + const float * embd = llama_get_embeddings(ctx_cts); + + auto audio = embd_to_audio(embd, n_codes, n_embd, params.cpuparams.n_threads); + +#else + // read the spectrogram from a file for debugging purposes + std::vector audio; + { + std::ifstream fin("out.bin", std::ios::binary); + if (!fin) { + LOG_ERR("%s: failed to open file '%s'\n", __func__, "out.bin"); + return 1; + } + + std::vector embd; + + int n_codes; + int n_embd; + + fin.read(reinterpret_cast(&n_codes), sizeof(int)); + fin.read(reinterpret_cast(&n_embd), sizeof(int)); + + embd.resize(n_codes * n_embd); + fin.read(reinterpret_cast(embd.data()), n_codes * n_embd * sizeof(float)); + fin.close(); + + LOG_INF("%s: n_codes: %d, n_embd: %d\n", __func__, n_codes, n_embd); + + audio = embd_to_audio(embd.data(), n_codes, n_embd, params.cpuparams.n_threads); + } +#endif + + const std::string fname = "output.wav"; + + const int n_sr = 24000; // sampling rate + + // zero out first 0.25 seconds + for (int i = 0; i < 24000/4; ++i) { + audio[i] = 0.0f; + } + + LOG_INF("%s: time for spectral ops: %.3f ms\n", __func__, (ggml_time_us() - t_spec_start) / 1000.0f); + LOG_INF("%s: total time: %.3f ms\n", __func__, (ggml_time_us() - t_main_start) / 1000.0f); + + save_wav16(fname, audio, n_sr); + + LOG_INF("%s: audio written to file '%s'\n", __func__, fname.c_str()); + + llama_free(ctx_ttc); + llama_free_model(model_ttc); + + llama_free(ctx_cts); + llama_free_model(model_cts); + + llama_backend_free(); + + return 0; +} diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 3442142ad..e33d97482 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -74,10 +74,10 @@ if (NOT GGML_CUDA_GRAPHS_DEFAULT) endif() # general -option(GGML_STATIC "ggml: static link libraries" OFF) -option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) -option(GGML_LTO "ggml: enable link time optimization" OFF) -option(GGML_CCACHE "ggml: use ccache if available" ON) +option(GGML_STATIC "ggml: static link libraries" OFF) +option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT}) +option(GGML_LTO "ggml: enable link time optimization" OFF) +option(GGML_CCACHE "ggml: use ccache if available" ON) # debug option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON) @@ -120,8 +120,9 @@ endif() option(GGML_LASX "ggml: enable lasx" ON) option(GGML_LSX "ggml: enable lsx" ON) option(GGML_RVV "ggml: enable rvv" ON) -option(GGML_SVE "ggml: enable SVE" OFF) + option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF) +set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM") if (WIN32) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b0c1ac9ce..c714fc8c8 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1564,17 +1564,6 @@ extern "C" { int d1, // dilation dimension 1 bool is_2D); - GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, // convolution kernel - struct ggml_tensor * b, // data - int s0, // stride dimension 0 - int s1, // stride dimension 1 - int p0, // padding dimension 0 - int p1, // padding dimension 1 - int d0, // dilation dimension 0 - int d1); // dilation dimension 1 - GGML_API struct ggml_tensor * ggml_conv_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1592,6 +1581,23 @@ extern "C" { int s, // stride int d); // dilation + // depthwise + // TODO: this is very likely wrong for some cases! - needs more testing + GGML_API struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0); // dilation + + GGML_API struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int d0); // dilation + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1611,7 +1617,6 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 - // kernel size is a->ne[0] x a->ne[1] // stride is equal to kernel size // padding is zero @@ -1638,6 +1643,18 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // depthwise + GGML_API struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride dimension 0 + int s1, // stride dimension 1 + int p0, // padding dimension 0 + int p1, // padding dimension 1 + int d0, // dilation dimension 0 + int d1); // dilation dimension 1 + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index bf5ee5fc2..a5f7f7b5b 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -234,6 +234,7 @@ function(ggml_add_backend_library backend) # write the shared library to the output directory set_target_properties(${backend} PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) target_compile_definitions(${backend} PRIVATE GGML_BACKEND_DL) + add_dependencies(ggml ${backend}) else() add_library(${backend} ${ARGN}) target_link_libraries(ggml PUBLIC ${backend}) diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c index 2b2240be8..8dc8226ac 100644 --- a/ggml/src/ggml-alloc.c +++ b/ggml/src/ggml-alloc.c @@ -534,7 +534,6 @@ static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); hn->buffer_id = buffer_id; hn->offset = offset; - return; } } diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 66927148a..7ddd178b5 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -66,6 +66,26 @@ #include "ggml-kompute.h" #endif +// disable C++17 deprecation warning for std::codecvt_utf8 +#if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wdeprecated-declarations" +#endif + +static std::wstring utf8_to_utf16(const std::string & str) { + std::wstring_convert> converter; + return converter.from_bytes(str); +} + +static std::string utf16_to_utf8(const std::wstring & str) { + std::wstring_convert> converter; + return converter.to_bytes(str); +} + +#if defined(__clang__) +# pragma clang diagnostic pop +#endif + #ifdef _WIN32 using dl_handle = std::remove_pointer_t; @@ -88,11 +108,6 @@ static dl_handle * dl_load_library(const std::wstring & path) { return handle; } -static dl_handle * dl_load_library(const std::string & path) { - std::wstring_convert> converter; - return dl_load_library(converter.from_bytes(path)); -} - static void * dl_get_sym(dl_handle * handle, const char * name) { DWORD old_mode = SetErrorMode(SEM_FAILCRITICALERRORS); SetErrorMode(old_mode | SEM_FAILCRITICALERRORS); @@ -114,8 +129,8 @@ struct dl_handle_deleter { } }; -static void * dl_load_library(const std::string & path) { - dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL); +static void * dl_load_library(const std::wstring & path) { + dl_handle * handle = dlopen(utf16_to_utf8(path).c_str(), RTLD_NOW | RTLD_LOCAL); return handle; } @@ -202,11 +217,11 @@ struct ggml_backend_registry { devices.push_back(device); } - ggml_backend_reg_t load_backend(const char * path, bool silent) { + ggml_backend_reg_t load_backend(const std::wstring & path, bool silent) { dl_handle_ptr handle { dl_load_library(path) }; if (!handle) { if (!silent) { - GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path); + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(path).c_str()); } return nullptr; } @@ -214,7 +229,7 @@ struct ggml_backend_registry { auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); if (score_fn && score_fn() == 0) { if (!silent) { - GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path); + GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, utf16_to_utf8(path).c_str()); } return nullptr; } @@ -222,7 +237,7 @@ struct ggml_backend_registry { auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init"); if (!backend_init_fn) { if (!silent) { - GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path); + GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, utf16_to_utf8(path).c_str()); } return nullptr; } @@ -231,16 +246,16 @@ struct ggml_backend_registry { if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) { if (!silent) { if (!reg) { - GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path); + GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, utf16_to_utf8(path).c_str()); } else { GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n", - __func__, path, reg->api_version, GGML_BACKEND_API_VERSION); + __func__, utf16_to_utf8(path).c_str(), reg->api_version, GGML_BACKEND_API_VERSION); } } return nullptr; } - GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path); + GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), utf16_to_utf8(path).c_str()); register_backend(reg, std::move(handle)); @@ -376,14 +391,14 @@ ggml_backend_t ggml_backend_init_best(void) { // Dynamic loading ggml_backend_reg_t ggml_backend_load(const char * path) { - return get_reg().load_backend(path, false); + return get_reg().load_backend(utf8_to_utf16(path), false); } void ggml_backend_unload(ggml_backend_reg_t reg) { get_reg().unload_backend(reg, true); } -static std::string get_executable_path() { +static std::wstring get_executable_path() { #if defined(__APPLE__) // get executable path std::vector path; @@ -401,13 +416,17 @@ static std::string get_executable_path() { if (last_slash != std::string::npos) { base_path = base_path.substr(0, last_slash); } - return base_path + "/"; -#elif defined(__linux__) + return utf8_to_utf16(base_path + "/"); +#elif defined(__linux__) || defined(__FreeBSD__) std::string base_path = "."; std::vector path(1024); while (true) { // get executable path +# if defined(__linux__) ssize_t len = readlink("/proc/self/exe", path.data(), path.size()); +# elif defined(__FreeBSD__) + ssize_t len = readlink("/proc/curproc/file", path.data(), path.size()); +# endif if (len == -1) { break; } @@ -423,57 +442,63 @@ static std::string get_executable_path() { path.resize(path.size() * 2); } - return base_path + "/"; + return utf8_to_utf16(base_path + "/"); #elif defined(_WIN32) - std::vector path(MAX_PATH); - DWORD len = GetModuleFileNameA(NULL, path.data(), path.size()); + std::vector path(MAX_PATH); + DWORD len = GetModuleFileNameW(NULL, path.data(), path.size()); if (len == 0) { - return ""; + return {}; } - std::string base_path(path.data(), len); + std::wstring base_path(path.data(), len); // remove executable name auto last_slash = base_path.find_last_of('\\'); if (last_slash != std::string::npos) { base_path = base_path.substr(0, last_slash); } - return base_path + "\\"; + return base_path + L"\\"; +#else + return {}; #endif } -static std::string backend_filename_prefix() { +static std::wstring backend_filename_prefix() { #ifdef _WIN32 - return "ggml-"; + return L"ggml-"; #else - return "libggml-"; + return L"libggml-"; #endif } -static std::string backend_filename_suffix() { +static std::wstring backend_filename_suffix() { #ifdef _WIN32 - return ".dll"; + return L".dll"; #else - return ".so"; + return L".so"; +#endif +} + +static std::wstring path_separator() { +#ifdef _WIN32 + return L"\\"; +#else + return L"/"; #endif } static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, const char * user_search_path) { // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths // TODO: search system paths - std::string file_prefix = backend_filename_prefix() + name + "-"; - std::vector search_paths; + std::wstring file_prefix = backend_filename_prefix() + utf8_to_utf16(name) + L"-"; + std::vector search_paths; if (user_search_path == nullptr) { - search_paths.push_back("./"); + search_paths.push_back(L"." + path_separator()); search_paths.push_back(get_executable_path()); } else { -#if defined(_WIN32) - search_paths.push_back(std::string(user_search_path) + "\\"); -#else - search_paths.push_back(std::string(user_search_path) + "/"); -#endif + search_paths.push_back(utf8_to_utf16(user_search_path) + path_separator()); } int best_score = 0; - std::string best_path; + std::wstring best_path; namespace fs = std::filesystem; for (const auto & search_path : search_paths) { @@ -483,27 +508,27 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied); for (const auto & entry : dir_it) { if (entry.is_regular_file()) { - std::string filename = entry.path().filename().string(); - std::string ext = entry.path().extension().string(); + std::wstring filename = entry.path().filename().wstring(); + std::wstring ext = entry.path().extension().wstring(); if (filename.find(file_prefix) == 0 && ext == backend_filename_suffix()) { - dl_handle_ptr handle { dl_load_library(entry.path().c_str()) }; + dl_handle_ptr handle { dl_load_library(entry.path().wstring()) }; if (!handle && !silent) { - GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str()); + GGML_LOG_ERROR("%s: failed to load %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); } if (handle) { auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score"); if (score_fn) { int s = score_fn(); #ifndef NDEBUG - GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s); + GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str(), s); #endif if (s > best_score) { best_score = s; - best_path = entry.path().string(); + best_path = entry.path().wstring(); } } else { if (!silent) { - GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str()); + GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, utf16_to_utf8(entry.path().wstring()).c_str()); } } } @@ -515,15 +540,15 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent, if (best_score == 0) { // try to load the base backend for (const auto & search_path : search_paths) { - std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix(); + std::wstring path = search_path + backend_filename_prefix() + utf8_to_utf16(name) + backend_filename_suffix(); if (fs::exists(path)) { - return get_reg().load_backend(path.c_str(), silent); + return get_reg().load_backend(path, silent); } } return nullptr; } - return get_reg().load_backend(best_path.c_str(), silent); + return get_reg().load_backend(best_path, silent); } void ggml_backend_load_all() { diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 0e0556703..f0aecac1b 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -74,112 +74,96 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR - (NOT CMAKE_OSX_ARCHITECTURES AND - NOT CMAKE_GENERATOR_PLATFORM_LWR AND + (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$")) message(STATUS "ARM detected") - if (MSVC) - list(APPEND ARCH_DEFINITIONS __aarch64__) # MSVC defines _M_ARM64 instead - list(APPEND ARCH_DEFINITIONS __ARM_NEON) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FMA) - - set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS}) - string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - - message(STATUS "ARM feature DOTPROD enabled") - endif () - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - check_cxx_source_compiles("#include \nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC) - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - - message(STATUS "ARM feature FP16_VECTOR_ARITHMETIC enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV}) - elseif (APPLE) - if (GGML_NATIVE) - set(USER_PROVIDED_MARCH FALSE) - foreach(flag_var IN ITEMS CMAKE_C_FLAGS CMAKE_CXX_FLAGS CMAKE_REQUIRED_FLAGS) - if ("${${flag_var}}" MATCHES "-march=[a-zA-Z0-9+._-]+") - set(USER_PROVIDED_MARCH TRUE) - break() - endif() - endforeach() - - if (NOT USER_PROVIDED_MARCH) - set(MARCH_FLAGS "-march=armv8.2a") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD) - if (GGML_COMPILER_SUPPORT_DOTPROD) - set(MARCH_FLAGS "${MARCH_FLAGS}+dotprod") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_DOTPROD) - - message(STATUS "ARM feature DOTPROD enabled") - endif () - - set(TEST_I8MM_FLAGS "-march=armv8.2a+i8mm") - - set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${TEST_I8MM_FLAGS}") - - check_cxx_source_compiles("#include \nint main() { int8x16_t _a, _b; int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8) - if (GGML_COMPILER_SUPPORT_MATMUL_INT8) - set(MARCH_FLAGS "${MARCH_FLAGS}+i8mm") - list(APPEND ARCH_DEFINITIONS __ARM_FEATURE_MATMUL_INT8) - - message(STATUS "ARM feature MATMUL_INT8 enabled") - endif () - - set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) - - list(APPEND ARCH_FLAGS "${MARCH_FLAGS}") - endif () - endif () + if (MSVC AND NOT CMAKE_C_COMPILER_ID STREQUAL "Clang") + message(FATAL_ERROR "MSVC is not supported for ARM, use clang") else() - check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E) - if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") + check_cxx_compiler_flag(-mfp16-format=ieee GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E) + if (NOT "${GGML_COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "") list(APPEND ARCH_FLAGS -mfp16-format=ieee) endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6") - # Raspberry Pi 1, Zero - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access) - endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7") - if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android") - # Android armeabi-v7a - list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations) - else() - # Raspberry Pi 2 - list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations) + + if (GGML_NATIVE) + # -mcpu=native does not always enable all the features in some compilers, + # so we check for them manually and enable them if available + + execute_process( + COMMAND ${CMAKE_C_COMPILER} -mcpu=native -E -v - + INPUT_FILE "/dev/null" + OUTPUT_QUIET + ERROR_VARIABLE ARM_MCPU + RESULT_VARIABLE ARM_MCPU_RESULT + ) + if (NOT ARM_MCPU_RESULT) + string(REGEX MATCH "-mcpu=[^ ']+" ARM_MCPU_FLAG "${ARM_MCPU}") + endif() + if ("${ARM_MCPU_FLAG}" STREQUAL "") + set(ARM_MCPU_FLAG -mcpu=native) + message(STATUS "ARM -mcpu not found, -mcpu=native will be used") + endif() + + include(CheckCXXSourceRuns) + + function(check_arm_feature tag code) + set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS}) + set(CMAKE_REQUIRED_FLAGS "${ARM_MCPU_FLAG}+${tag}") + check_cxx_source_runs( + "${code}" + GGML_MACHINE_SUPPORTS_${tag} + ) + if (GGML_MACHINE_SUPPORTS_${tag}) + set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+${tag}" PARENT_SCOPE) + else() + set(ARM_MCPU_FLAG_FIX "${ARM_MCPU_FLAG_FIX}+no${tag}" PARENT_SCOPE) + endif() + set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE}) + endfunction() + + check_arm_feature(dotprod "#include \nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }") + check_arm_feature(i8mm "#include \nint main() { int8x16_t _a, _b; volatile int32x4_t _s = vmmlaq_s32(_s, _a, _b); return 0; }") + check_arm_feature(sve "#include \nint main() { svfloat32_t _a, _b; volatile svfloat32_t _c = svadd_f32_z(svptrue_b8(), _a, _b); return 0; }") + + list(APPEND ARCH_FLAGS "${ARM_MCPU_FLAG}${ARM_MCPU_FLAG_FIX}") + else() + if (GGML_CPU_ARM_ARCH) + list(APPEND ARCH_FLAGS -march=${GGML_CPU_ARM_ARCH}) endif() endif() - if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8") - # Android arm64-v8a - # Raspberry Pi 3, 4, Zero 2 (32-bit) - list(APPEND ARCH_FLAGS -mno-unaligned-access) + + # show enabled features + if (CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + set(FEAT_INPUT_FILE "NUL") + else() + set(FEAT_INPUT_FILE "/dev/null") endif() - if (GGML_SVE) - list(APPEND ARCH_FLAGS -march=armv8.6-a+sve) + + execute_process( + COMMAND ${CMAKE_C_COMPILER} ${ARCH_FLAGS} -dM -E - + INPUT_FILE ${FEAT_INPUT_FILE} + OUTPUT_VARIABLE ARM_FEATURE + RESULT_VARIABLE ARM_FEATURE_RESULT + ) + if (ARM_FEATURE_RESULT) + message(WARNING "Failed to get ARM features") + else() + foreach(feature DOTPROD SVE MATMUL_INT8 FMA FP16_VECTOR_ARITHMETIC) + string(FIND "${ARM_FEATURE}" "__ARM_FEATURE_${feature} 1" feature_pos) + if (NOT ${feature_pos} EQUAL -1) + message(STATUS "ARM feature ${feature} enabled") + endif() + endforeach() endif() endif() elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND - CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) + CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$")) + + message(STATUS "x86 detected") + if (MSVC) # instruction set detection for MSVC only if (GGML_NATIVE) @@ -339,6 +323,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) target_compile_definitions(${GGML_CPU_NAME} PRIVATE ${ARCH_DEFINITIONS}) if (GGML_BACKEND_DL) + if (GGML_NATIVE) + # the feature check relies on ARCH_DEFINITIONS, but it is not set with GGML_NATIVE + message(FATAL_ERROR "GGML_NATIVE is not compatible with GGML_BACKEND_DL, consider using GGML_CPU_ALL_VARIANTS") + endif() + # The feature detection code is compiled as a separate target so that # it can be built without the architecture flags # Since multiple variants of the CPU backend may be included in the same diff --git a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp index a51d1a6c5..2d79b8b61 100644 --- a/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp @@ -564,21 +564,21 @@ static void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { - const block_q4_0x4 * b_ptr = (const block_q4_0x4 *)vx; + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; for (int c = 0; c < nc; c += ncols_interleaved) { - const block_q8_0 * a_ptr = (const block_q8_0 *)vy; + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; float32x4_t acc = vdupq_n_f32(0); for (int b = 0; b < nb; b++) { - int8x16_t b0 = vld1q_s8((const int8_t *)b_ptr->qs); - int8x16_t b1 = vld1q_s8((const int8_t *)b_ptr->qs + 16); - int8x16_t b2 = vld1q_s8((const int8_t *)b_ptr->qs + 32); - int8x16_t b3 = vld1q_s8((const int8_t *)b_ptr->qs + 48); - float16x4_t bd = vld1_f16((const __fp16 *)b_ptr->d); + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); int8x16_t a0 = vld1q_s8(a_ptr->qs); int8x16_t a1 = vld1q_s8(a_ptr->qs + qk/2); - float16x4_t ad = vld1_dup_f16((const __fp16 *)&a_ptr->d); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); int32x4_t ret = vdupq_n_s32(0); @@ -647,72 +647,52 @@ static void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, c UNUSED(ncols_interleaved); UNUSED(blocklen); -#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) - if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) { - const void * b_ptr = vx; - const void * a_ptr = vy; - float * res_ptr = s; +#if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) + if (ggml_cpu_has_neon() && ggml_cpu_has_dotprod()) { + const block_q4_0x4 * b_ptr = (const block_q4_0x4 *) vx; - __asm__ __volatile__( - "movi v2.16b, #0x4\n" - "movi v1.16b, #0xf0\n" - "add %x[b_ptr], %x[b_ptr], #0x8\n" - "1:" // Column loop - "add x23, %x[a_ptr], #0x2\n" - "movi v0.16b, #0x0\n" - "mov x22, %x[nb]\n" - "2:" // Block loop - "ldr q31, [%x[b_ptr], #0x0]\n" - "ldr q30, [%x[b_ptr], #0x10]\n" - "mov x21, x23\n" - "movi v29.4s, #0x0\n" - "ldr q28, [%x[b_ptr], #0x20]\n" - "ldr q27, [%x[b_ptr], #0x30]\n" - "movi v26.4s, #0x0\n" - "sub x20, x23, #0x2\n" - "ld1r { v25.8h }, [x20]\n" - "ldr q24, [%x[b_ptr], #-0x8]\n" - "sub x22, x22, #0x1\n" - "add x23, x23, #0x22\n" - "ld1r { v23.2d }, [x21], #0x8\n" - "sshl v22.16b, v31.16b, v2.16b\n" - "sshl v16.16b, v30.16b, v2.16b\n" - "add %x[b_ptr], %x[b_ptr], #0x48\n" - "ld1r { v21.2d }, [x21], #0x8\n" - "sshl v20.16b, v28.16b, v2.16b\n" - "sshl v19.16b, v27.16b, v2.16b\n" - "ld1r { v18.2d }, [x21], #0x8\n" - "ld1r { v17.2d }, [x21], #0x8\n" - "and v31.16b, v31.16b, v1.16b\n" - "and v30.16b, v30.16b, v1.16b\n" - ".inst 0x4e9796dd // sdot v29.4s, v22.16b, v23.16b\n" - ".inst 0x4e97961a // sdot v26.4s, v16.16b, v23.16b\n" - "and v28.16b, v28.16b, v1.16b\n" - "and v27.16b, v27.16b, v1.16b\n" - "fcvtl v25.4s, v25.4h\n" - "fcvtl v16.4s, v24.4h\n" - ".inst 0x4e95969d // sdot v29.4s, v20.16b, v21.16b\n" - ".inst 0x4e95967a // sdot v26.4s, v19.16b, v21.16b\n" - "fmul v16.4s, v16.4s, v25.4s\n" - ".inst 0x4e9297fd // sdot v29.4s, v31.16b, v18.16b\n" - ".inst 0x4e9297da // sdot v26.4s, v30.16b, v18.16b\n" - ".inst 0x4e91979d // sdot v29.4s, v28.16b, v17.16b\n" - ".inst 0x4e91977a // sdot v26.4s, v27.16b, v17.16b\n" - "addp v29.4s, v29.4s, v26.4s\n" - "scvtf v29.4s, v29.4s, #0x4\n" - "fmla v0.4s, v29.4s, v16.4s\n" - "cbnz x22, 2b\n" - "sub %x[nc], %x[nc], #0x4\n" - "str q0, [%x[res_ptr], #0x0]\n" - "add %x[res_ptr], %x[res_ptr], #0x10\n" - "cbnz %x[nc], 1b\n" - : [b_ptr] "+&r" (b_ptr), [res_ptr] "+&r" (res_ptr), [nc] "+&r" (nc) - : [a_ptr] "r" (a_ptr), [nb] "r" (nb) - : "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23" - ); + for (int c = 0; c < nc; c += ncols_interleaved) { + const block_q8_0 * a_ptr = (const block_q8_0 *) vy; + float32x4_t acc = vdupq_n_f32(0); + for (int b = 0; b < nb; b++) { + int8x16_t b0 = vld1q_s8((const int8_t *) b_ptr->qs); + int8x16_t b1 = vld1q_s8((const int8_t *) b_ptr->qs + 16); + int8x16_t b2 = vld1q_s8((const int8_t *) b_ptr->qs + 32); + int8x16_t b3 = vld1q_s8((const int8_t *) b_ptr->qs + 48); + float16x4_t bd = vld1_f16((const __fp16 *) b_ptr->d); + + int8x16_t a0 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs); + int8x16_t a1 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 1); + int8x16_t a2 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 2); + int8x16_t a3 = (int8x16_t) vld1q_dup_s64((const int64_t *) a_ptr->qs + 3); + float16x4_t ad = vld1_dup_f16((const __fp16 *) &a_ptr->d); + + int32x4_t ret0 = vdupq_n_s32(0); + int32x4_t ret1 = vdupq_n_s32(0); + + ret0 = vdotq_s32(ret0, b0 << 4, a0); + ret1 = vdotq_s32(ret1, b1 << 4, a0); + ret0 = vdotq_s32(ret0, b2 << 4, a1); + ret1 = vdotq_s32(ret1, b3 << 4, a1); + + ret0 = vdotq_s32(ret0, b0 & 0xf0U, a2); + ret1 = vdotq_s32(ret1, b1 & 0xf0U, a2); + ret0 = vdotq_s32(ret0, b2 & 0xf0U, a3); + ret1 = vdotq_s32(ret1, b3 & 0xf0U, a3); + + int32x4_t ret = vpaddq_s32(ret0, ret1); + + acc = vfmaq_f32(acc, vcvtq_n_f32_s32(ret, 4), + vmulq_f32(vcvt_f32_f16(ad), vcvt_f32_f16(bd))); + a_ptr++; + b_ptr++; + } + vst1q_f32(s, acc); + s += ncols_interleaved; + } return; } -#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_MATMUL_INT8) +#endif // #if ! ((defined(_MSC_VER)) && ! defined(__clang__)) && defined(__aarch64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_DOTPROD) float sumf[4]; int sumi; diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 67e67a089..b7fefb9dd 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -986,7 +986,7 @@ inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) { #define GGML_F16_STEP 32 #define GGML_F16_EPR 4 -static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { +static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) { float tmp[4]; tmp[0] = GGML_FP16_TO_FP32(x[0]); @@ -997,7 +997,7 @@ static inline __m128 __sse_f16x4_load(ggml_fp16_t *x) { return _mm_loadu_ps(tmp); } -static inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) { +static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) { float arr[4]; _mm_storeu_ps(arr, y); @@ -7419,14 +7419,14 @@ static void ggml_compute_forward_mul_mat( if (src1_cont) { for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + if (!llamafile_sgemm(params, + ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), (const char *)src1->data + i12*nb12 + i13*nb13, nb11/ggml_type_size(src1->type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), - ith, nth, src0->type, src1->type, dst->type)) @@ -7471,14 +7471,14 @@ UseGgmlGemm1:; for (int64_t i13 = 0; i13 < ne13; i13++) for (int64_t i12 = 0; i12 < ne12; i12++) - if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + if (!llamafile_sgemm(params, + ne01, ne11, ne00/ggml_blck_size(src0->type), (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, nb01/ggml_type_size(src0->type), (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, row_size/ggml_type_size(vec_dot_type), (char *)dst->data + i12*nb2 + i13*nb3, nb1/ggml_type_size(dst->type), - ith, nth, src0->type, vec_dot_type, dst->type)) diff --git a/ggml/src/ggml-cpu/ggml-cpu.cpp b/ggml/src/ggml-cpu/ggml-cpu.cpp index c390957af..f11399cc6 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.cpp +++ b/ggml/src/ggml-cpu/ggml-cpu.cpp @@ -394,8 +394,11 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st switch (op->op) { case GGML_OP_CPY: return + op->type != GGML_TYPE_IQ3_XXS && + op->type != GGML_TYPE_IQ3_S && op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && + op->type != GGML_TYPE_IQ2_S && op->type != GGML_TYPE_IQ1_S && op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: @@ -519,6 +522,12 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r if (ggml_cpu_has_sve()) { features.push_back({ "SVE", "1" }); } + if (ggml_cpu_has_dotprod()) { + features.push_back({ "DOTPROD", "1" }); + } + if (ggml_cpu_has_matmul_int8()) { + features.push_back({ "MATMUL_INT8", "1" }); + } if (ggml_cpu_get_sve_cnt() > 0) { static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt()); features.push_back({ "SVE_CNT", sve_cnt.c_str() }); diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index da4146ec4..00f7f1170 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -53,6 +53,8 @@ #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include + #ifdef _MSC_VER #define NOINLINE __declspec(noinline) #else @@ -134,6 +136,16 @@ inline __m512 madd(__m512 a, __m512 b, __m512 c) { return _mm512_fmadd_ps(a, b, c); } #endif +#if defined(__AVX512BF16__) +template <> +inline __m512 madd(__m512bh a, __m512bh b, __m512 c) { + return _mm512_dpbf16_ps(c, a, b); +} +template <> +inline __m256 madd(__m256bh a, __m256bh b, __m256 c) { + return _mm256_dpbf16_ps(c, a, b); +} +#endif #endif #if defined(__ARM_FEATURE_FMA) @@ -204,6 +216,7 @@ template <> inline float32x4_t load(const float *p) { return vld1q_f32(p); } #if !defined(_MSC_VER) +// FIXME: this should check for __ARM_FEATURE_FP16_VECTOR_ARITHMETIC template <> inline float16x8_t load(const ggml_fp16_t *p) { return vld1q_f16((const float16_t *)p); } @@ -225,6 +238,13 @@ template <> inline __m256 load(const float *p) { } #endif // __AVX__ +#if defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m256 load(const ggml_bf16_t *p) { + return _mm256_castsi256_ps( + _mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)p)), 16)); +} +#endif // __AVX2__ + #if defined(__F16C__) template <> inline __m256 load(const ggml_fp16_t *p) { return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); @@ -238,8 +258,27 @@ template <> inline __m512 load(const float *p) { template <> inline __m512 load(const ggml_fp16_t *p) { return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); } +template <> inline __m512 load(const ggml_bf16_t *p) { + return _mm512_castsi512_ps( + _mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)p)), 16)); +} #endif // __AVX512F__ +#if defined(__AVX512BF16__) +template <> inline __m512bh load(const ggml_bf16_t *p) { + return (__m512bh)_mm512_loadu_ps((const float *)p); +} +template <> inline __m256bh load(const ggml_bf16_t *p) { + return (__m256bh)_mm256_loadu_ps((const float *)p); +} +template <> inline __m512bh load(const float *p) { + return _mm512_cvtne2ps_pbh(_mm512_loadu_ps(p + 16), _mm512_loadu_ps(p)); +} +template <> inline __m256bh load(const float *p) { + return _mm512_cvtneps_pbh(_mm512_loadu_ps(p)); +} +#endif + //////////////////////////////////////////////////////////////////////////////////////////////////// // CONSTANTS @@ -251,199 +290,170 @@ static const __m128i iq4nlt = _mm_loadu_si128((const __m128i *) kvalues_iq4nl); //////////////////////////////////////////////////////////////////////////////////////////////////// // FLOATING POINT MATRIX MULTIPLICATION +template +static inline int64_t BLOCK_SIZE(size_t m) { + const int64_t NB_BLOC_M = (m + M - 1) / M; + return (m % NB_BLOC_M == 0) ? m / NB_BLOC_M : (m / NB_BLOC_M) + 1; +} + +static constexpr inline int64_t BLOC_POS(int64_t ib, int64_t ibN, int64_t bloc_size) { + return ib < ibN ? ib * bloc_size : ibN * bloc_size + (ib - ibN) * (bloc_size - 1); +} + template class tinyBLAS { public: - tinyBLAS(int64_t k, + tinyBLAS(const ggml_compute_params * params, int64_t k, const TA *A, int64_t lda, const TB *B, int64_t ldb, - TC *C, int64_t ldc, - int ith, int nth) - : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + TC *C, int64_t ldc) + : params(params), A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc) { } - void matmul(int64_t m, int64_t n) { - mnpack(0, m, 0, n); + bool matmul(int64_t m, int64_t n) { + if (k % KN != 0) + return false; + // compute RM for only need tile with size RM&RM-1 +#if VECTOR_REGISTERS == 32 + if (m % 16 == 0 && (m/16 >= params->nth)) { + const int64_t SIZE_N = BLOCK_SIZE<6>(n); + mnpack<4, 6, 4>(m, n, SIZE_N, 12); + return true; + } + if (m % 8 == 0 ) { + const int64_t SIZE_N = BLOCK_SIZE<6>(n); + mnpack<4, 6, 2>(m, n, SIZE_N, 12); + return true; + } + if (m % 4 == 0) { + const int64_t SIZE_N = BLOCK_SIZE<6>(n); + mnpack<4, 6, 1>(m, n, SIZE_N, 12); + return true; + } +#else // VECTOR_REGISTERS == 16 + if (m % 16 == 0 && (m/16 >= params->nth)) { + const int64_t SIZE_N = BLOCK_SIZE<3>(n); + mnpack<4, 3, 4>(m, n, SIZE_N, 24); + return true; + } + if (m % 8 == 0 ) { + const int64_t SIZE_N = BLOCK_SIZE<3>(n); + mnpack<4, 3, 2>(m, n, SIZE_N, 24); + return true; + } + if (m % 4 == 0) { + const int64_t SIZE_N = BLOCK_SIZE<3>(n); + mnpack<4, 3, 1>(m, n, SIZE_N, 24); + return true; + } +#endif + return false; } private: - NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { - int64_t mc, nc, mp, np; - switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) { -#if VECTOR_REGISTERS == 32 - case 0x55: - mc = 5; - nc = 5; - gemm<5, 5>(m0, m, n0, n); - break; - case 0x45: - mc = 4; - nc = 5; - gemm<4, 5>(m0, m, n0, n); - break; - case 0x54: - mc = 5; - nc = 4; - gemm<5, 4>(m0, m, n0, n); - break; - case 0x44: - mc = 4; - nc = 4; - gemm<4, 4>(m0, m, n0, n); - break; - case 0x53: - mc = 5; - nc = 3; - gemm<5, 3>(m0, m, n0, n); - break; - case 0x35: - mc = 3; - nc = 5; - gemm<3, 5>(m0, m, n0, n); - break; - case 0x43: - mc = 4; - nc = 3; - gemm<4, 3>(m0, m, n0, n); - break; -#else - case 0x55: - case 0x54: - case 0x53: - case 0x45: - case 0x44: - case 0x43: - mc = 4; - nc = 3; - gemm<4, 3>(m0, m, n0, n); - break; - case 0x35: -#endif - case 0x34: - mc = 3; - nc = 4; - gemm<3, 4>(m0, m, n0, n); - break; - case 0x52: - mc = 5; - nc = 2; - gemm<5, 2>(m0, m, n0, n); - break; - case 0x33: - mc = 3; - nc = 3; - gemm<3, 3>(m0, m, n0, n); - break; - case 0x25: - mc = 2; - nc = 5; - gemm<2, 5>(m0, m, n0, n); - break; - case 0x42: - mc = 4; - nc = 2; - gemm<4, 2>(m0, m, n0, n); - break; - case 0x24: - mc = 2; - nc = 4; - gemm<2, 4>(m0, m, n0, n); - break; - case 0x32: - mc = 3; - nc = 2; - gemm<3, 2>(m0, m, n0, n); - break; - case 0x23: - mc = 2; - nc = 3; - gemm<2, 3>(m0, m, n0, n); - break; - case 0x51: - mc = 5; - nc = 1; - gemm<5, 1>(m0, m, n0, n); - break; - case 0x41: - mc = 4; - nc = 1; - gemm<4, 1>(m0, m, n0, n); - break; - case 0x22: - mc = 2; - nc = 2; - gemm<2, 2>(m0, m, n0, n); - break; - case 0x15: - mc = 1; - nc = 5; - gemm<1, 5>(m0, m, n0, n); - break; - case 0x14: - mc = 1; - nc = 4; - gemm<1, 4>(m0, m, n0, n); - break; - case 0x31: - mc = 3; - nc = 1; - gemm<3, 1>(m0, m, n0, n); - break; - case 0x13: - mc = 1; - nc = 3; - gemm<1, 3>(m0, m, n0, n); - break; - case 0x21: - mc = 2; - nc = 1; - gemm<2, 1>(m0, m, n0, n); - break; - case 0x12: - mc = 1; - nc = 2; - gemm<1, 2>(m0, m, n0, n); - break; - case 0x11: - mc = 1; - nc = 1; - gemm<1, 1>(m0, m, n0, n); - break; - default: - return; + template + inline void mnpack(int64_t m, int64_t n, int64_t SIZE_N, int64_t BN) { + if (SIZE_N == RN) { + return gemm(m, n, BN); + } + if constexpr (RN > 1) { + return mnpack(m, n, SIZE_N, BN); + } else { + GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N); + GGML_ASSERT(false); // we have miss something. } - mp = m0 + (m - m0) / mc * mc; - np = n0 + (n - n0) / nc * nc; - mnpack(mp, m, n0, np); - mnpack(m0, m, np, n); } template - NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { - int64_t ytiles = (m - m0) / RM; - int64_t xtiles = (n - n0) / RN; - int64_t tiles = xtiles * ytiles; - int64_t duty = (tiles + nth - 1) / nth; - int64_t start = duty * ith; - int64_t end = start + duty; - if (end > tiles) - end = tiles; - for (int64_t job = start; job < end; ++job) { - int64_t ii = m0 + job / xtiles * RM; - int64_t jj = n0 + job % xtiles * RN; - D Cv[RN][RM] = {}; - for (int64_t l = 0; l < k; l += KN) - for (int64_t j = 0; j < RN; ++j) - for (int64_t i = 0; i < RM; ++i) - Cv[j][i] = madd(load(A + lda * (ii + i) + l), - load(B + ldb * (jj + j) + l), - Cv[j][i]); - for (int64_t j = 0; j < RN; ++j) - for (int64_t i = 0; i < RM; ++i) - C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); + inline void gemm_bloc(int64_t ii, int64_t jj) { + D Cv[RN][RM] = {}; + for (int64_t l = 0; l < k; l += KN) { + // help compiler for op order. + if constexpr (RM <= RN) { + V Av[RM]; + for (int64_t i = 0; i < RM; ++i) { + Av[i] = load(A + lda * (ii + i) + l); + } + for (int64_t j = 0; j < RN; ++j) { + V Bv = load(B + ldb * (jj + j) + l); + for (int64_t i = 0; i < RM; ++i) { + Cv[j][i] = madd(Av[i], Bv, Cv[j][i]); + } + } + } else { + V Bv[RN]; + for (int64_t j = 0; j < RN; ++j) { + Bv[j] = load(B + ldb * (jj + j) + l); + } + for (int64_t i = 0; i < RM; ++i) { + V Av = load(A + lda * (ii + i) + l); + for (int64_t j = 0; j < RN; ++j) { + Cv[j][i] = madd(Av, Bv[j], Cv[j][i]); + } + } + } } + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); } + template + NOINLINE void gemm(int64_t m, int64_t n, int64_t BN) { + static std::atomic current_chunk; + + GGML_ASSERT(m % (RM * BM) == 0); + const int64_t ytiles = m / (RM * BM); + const int64_t xtiles = (n + RN -1) / RN; + const int64_t jj_RN = (xtiles - (xtiles * RN - n)); + + // "round" bloc_size to "nearest" BN + const int64_t NB_BN = xtiles < BN ? 1 : (xtiles + BN / 2) / BN; + const int64_t SIZE_BN = xtiles % NB_BN == 0 ? xtiles / NB_BN : xtiles / NB_BN + 1; + const int64_t jj_BN = (NB_BN - (NB_BN * SIZE_BN - xtiles)); + const int64_t nb_job = ytiles * NB_BN; + + if (params->ith == 0) { + GGML_ASSERT( jj_BN * SIZE_BN + (NB_BN - jj_BN) * (SIZE_BN - 1) == xtiles); + // Every thread starts at ith, so the first unprocessed chunk is nth. This save a bit of coordination right at the start. + std::atomic_store_explicit(¤t_chunk, (int64_t)params->nth, std::memory_order_relaxed); + } + + ggml_barrier(params->threadpool); + + int64_t job = params->ith; + while (job < nb_job) { + const int64_t ii = (job % ytiles) * RM * BM; + const int64_t jb = job / ytiles; + const int64_t jr0 = BLOC_POS(jb , jj_BN, SIZE_BN); + const int64_t jrN = BLOC_POS(jb+1, jj_BN, SIZE_BN); + + const int64_t jj0 = BLOC_POS(jr0, jj_RN, RN); + const int64_t jj2 = BLOC_POS(jrN, jj_RN, RN); + const int64_t jj1 = jj2 < jj_RN * RN ? jj2 : jj_RN * RN; + + for (int64_t bi = 0; bi < BM * RM; bi += RM) { + int64_t jj = jj0; + for (; jj < jj1; jj += RN) { + gemm_bloc(ii + bi, jj); + } + if constexpr (RN > 1) { + for (; jj < jj2; jj += RN - 1) { + gemm_bloc(ii + bi, jj); + } + } + GGML_ASSERT(jj == jj2); + } + + // next step. + job = std::atomic_fetch_add_explicit(¤t_chunk, (int64_t)1, std::memory_order_relaxed); + } + + ggml_barrier(params->threadpool); + return; + } + + const ggml_compute_params * params; const TA *const A; const TB *const B; TC *const C; @@ -451,8 +461,6 @@ class tinyBLAS { const int64_t lda; const int64_t ldb; const int64_t ldc; - const int ith; - const int nth; }; ////////////////////////////////////////////////////////////////////////////////////////// @@ -1656,8 +1664,9 @@ class tinyBLAS_PPC { * @param Ctype is GGML data type of `C` * @return true if this function was able to service the matmul request */ -bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, - int64_t ldc, int ith, int nth, int Atype, int Btype, int Ctype) { +bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64_t n, int64_t k, + const void *A, int64_t lda, const void *B, int64_t ldb, void *C, + int64_t ldc, int Atype, int Btype, int Ctype) { assert(m >= 0); assert(n >= 0); @@ -1665,8 +1674,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda assert(lda >= k); assert(ldb >= k); assert(ldc >= m); - assert(nth > 0); - assert(ith < nth); + assert(params->nth > 0); + assert(params->ith < params->nth); // only enable sgemm for prompt processing if (n < 2) @@ -1681,37 +1690,25 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda if (Btype != GGML_TYPE_F32) return false; #if defined(__AVX512F__) - if (k % 16) - return false; - tinyBLAS<16, __m512, __m512, float, float, float> tb{ + tinyBLAS<16, __m512, __m512, float, float, float> tb{ params, k, (const float *)A, lda, (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + (float *)C, ldc}; + return tb.matmul(m, n); #elif defined(__AVX__) || defined(__AVX2__) - if (k % 8) - return false; - tinyBLAS<8, __m256, __m256, float, float, float> tb{ + tinyBLAS<8, __m256, __m256, float, float, float> tb{ params, k, (const float *)A, lda, (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + (float *)C, ldc}; + return tb.matmul(m, n); #elif defined(__ARM_NEON) if (n < 4) return false; - if (k % 4) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ params, k, (const float *)A, lda, (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + (float *)C, ldc}; + return tb.matmul(m, n); #elif defined(__MMA__) if (k % 8) return false; @@ -1719,7 +1716,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const float *)A, lda, (const float *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #else @@ -1727,60 +1724,71 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda #endif } + case GGML_TYPE_BF16: { +#if defined(__AVX512BF16__) + if (Btype == GGML_TYPE_BF16) { + tinyBLAS<32, __m512, __m512bh, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k, + (const ggml_bf16_t *)A, lda, + (const ggml_bf16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } +#elif defined(__AVX512F__) + if (Btype == GGML_TYPE_BF16) { + tinyBLAS<16, __m512, __m512, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k, + (const ggml_bf16_t *)A, lda, + (const ggml_bf16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } +#elif defined(__AVX2__) + if (Btype == GGML_TYPE_BF16) { + tinyBLAS<8, __m256, __m256, ggml_bf16_t, ggml_bf16_t, float> tb{ params, k, + (const ggml_bf16_t *)A, lda, + (const ggml_bf16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } +#endif + return false; + } case GGML_TYPE_F16: { #if defined(__AVX512F__) - if (k % 16) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if (Btype == GGML_TYPE_F16) { + tinyBLAS<16, __m512, __m512, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k, + (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } #elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) - if (k % 8) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if (Btype == GGML_TYPE_F16) { + tinyBLAS<8, __m256, __m256, ggml_fp16_t, ggml_fp16_t, float> tb{ params, k, + (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } #elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) if (n < 8) return false; - if (k % 8) - return false; - if (Btype != GGML_TYPE_F16) - return false; - tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const ggml_fp16_t *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; + if (Btype == GGML_TYPE_F16) { + tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ params, + k, (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } #elif defined(__ARM_NEON) && !defined(_MSC_VER) - if (k % 4) - return false; - if (Btype != GGML_TYPE_F32) - return false; - tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ - k, (const ggml_fp16_t *)A, lda, - (const float *)B, ldb, - (float *)C, ldc, - ith, nth}; - tb.matmul(m, n); - return true; -#else - return false; + if (Btype == GGML_TYPE_F32) { + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ params, + k, (const ggml_fp16_t *)A, lda, + (const float *)B, ldb, + (float *)C, ldc}; + return tb.matmul(m, n); + } #endif + return false; } case GGML_TYPE_Q8_0: { @@ -1791,7 +1799,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_q8_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #elif defined(__ARM_FEATURE_DOTPROD) @@ -1799,7 +1807,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_q8_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #else @@ -1815,7 +1823,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_q4_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #elif defined(__ARM_FEATURE_DOTPROD) @@ -1823,7 +1831,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_q4_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #else @@ -1839,7 +1847,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_q5_0 *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #else @@ -1855,7 +1863,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda k, (const block_iq4_nl *)A, lda, (const block_q8_0 *)B, ldb, (float *)C, ldc, - ith, nth}; + params->ith, params->nth}; tb.matmul(m, n); return true; #else @@ -1867,6 +1875,7 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda return false; } + (void)params; (void)m; (void)n; (void)k; @@ -1876,8 +1885,6 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda (void)ldb; (void)C; (void)ldc; - (void)ith; - (void)nth; (void)Atype; (void)Btype; (void)Ctype; diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.h b/ggml/src/ggml-cpu/llamafile/sgemm.h index caf6dd556..3d2909515 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.h +++ b/ggml/src/ggml-cpu/llamafile/sgemm.h @@ -5,8 +5,8 @@ extern "C" { #endif -bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, - const void *, int64_t, void *, int64_t, int, int, +bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t, int64_t, int64_t, + const void *, int64_t, const void *, int64_t, void *, int64_t, int, int, int); #ifdef __cplusplus diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index f961134ed..549772c57 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -551,6 +551,22 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) +// expose GGUF internals for test code + +GGML_API size_t gguf_type_size(enum gguf_type type); + +GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params); + +struct gguf_buf { + void * data; + size_t size; + size_t offset; +}; +GGML_API struct gguf_buf gguf_buf_init(size_t size); +GGML_API void gguf_buf_free(struct gguf_buf buf); + +GGML_API void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-sycl/common.cpp b/ggml/src/ggml-sycl/common.cpp index a9ee40491..88314a5cd 100644 --- a/ggml/src/ggml-sycl/common.cpp +++ b/ggml/src/ggml-sycl/common.cpp @@ -11,6 +11,8 @@ // #include "common.hpp" + +#include "ggml-backend-impl.h" #include "ggml-impl.h" int get_current_device_id() { @@ -65,9 +67,9 @@ void ggml_sycl_op_flatten(ggml_backend_sycl_context & ctx, const ggml_tensor *sr const ggml_sycl_op_flatten_t op) try { const bool use_src1 = src1 != nullptr; - - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + if(use_src1) + GGML_ASSERT(strcmp(src1->buffer->buft->iface.get_name(src1->buffer->buft), GGML_SYCL_NAME "_Split") != 0); + GGML_ASSERT(strcmp(dst->buffer->buft->iface.get_name(dst->buffer->buft), GGML_SYCL_NAME "_Split") != 0); // dd = data device float * src0_ddf = (float *) src0->data; diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index c1582f610..62b4cea3a 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -26,7 +26,11 @@ #define GGML_COMMON_DECL_SYCL #define GGML_COMMON_IMPL_SYCL +/* suppress warning spam */ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wnested-anon-types" #include "ggml-common.h" +#pragma clang diagnostic pop void* ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void* ptr); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 84f1328e7..312ccfeb8 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -288,10 +288,8 @@ ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor *tensor) try { ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; - if (tensor->view_src != NULL && tensor->view_offs == 0) { + if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); - tensor->backend = tensor->view_src->backend; - tensor->extra = tensor->view_src->extra; return; } @@ -539,7 +537,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { auto dev_count = ggml_backend_sycl_get_device_count(); if (device>=dev_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", device, dev_count-1); GGML_ASSERT(devicedevice; if (device>=ggml_sycl_info().device_count or device<0) { - printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", + GGML_LOG_ERROR("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", device, ggml_sycl_info().device_count-1); GGML_ASSERT(devicetype, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); } - // FIXME: do not crash if cudaMalloc fails + // FIXME: do not crash if SYCL Buffer alloc fails // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first ggml_sycl_set_device(i); const queue_ptr stream = ctx->streams[i]; @@ -788,7 +786,6 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); } } - tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; tensor->extra = extra; } catch (sycl::exception const &exc) { @@ -2349,12 +2346,22 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, dpct::memcpy_direction kind; char * src_ptr; - if (src->backend == GGML_BACKEND_TYPE_CPU) { + if (ggml_backend_buffer_is_host(src->buffer)) { kind = dpct::host_to_device; + //GGML_SYCL_DEBUG("%s: Host buffer type src tensor\n", __func__); src_ptr = (char *) src->data; // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d GGML_BACKEND_TYPE_CPU src_ptr %p\n", src_ptr); - } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); + } else if (ggml_backend_buffer_is_sycl(src->buffer)) { + // If buffer is a SYCL buffer + //GGML_SYCL_DEBUG("%s: SYCL buffer type src tensor\n", __func__); + kind = dpct::device_to_device; + src_ptr = (char *) src->data; + } else if (ggml_backend_buffer_is_sycl_split(src->buffer)) { + /* + If buffer is a SYCL split buffer + */ + //GGML_SYCL_DEBUG("%s: Split buffer type src tensor\n", __func__); + GGML_ASSERT(i1_low == 0 && i1_high == src->ne[1]); kind = dpct::device_to_device; ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; @@ -2857,8 +2864,8 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(dst->buffer)); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src1->buffer)); GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -2878,7 +2885,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); @@ -3198,7 +3205,7 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg const ggml_tensor *src1, ggml_tensor *dst) try { GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -3231,7 +3238,7 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -3293,7 +3300,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, ggml_tensor *dst) try { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(!ggml_backend_buffer_is_sycl_split(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS @@ -4638,10 +4645,9 @@ static ggml_backend_dev_t ggml_backend_sycl_reg_get_device(ggml_backend_reg_t re static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) { GGML_UNUSED(reg); - // TODO: update to the current function signature - //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { - // return (void *)ggml_backend_sycl_split_buffer_type; - //} + if (strcmp(name, "ggml_backend_split_buffer_type") == 0) { + return (void *)ggml_backend_sycl_split_buffer_type; + } // SYCL doesn't support registering host memory, left here for reference // "ggml_backend_register_host_buffer" diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index a26a8fe09..c0a43631c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -245,6 +245,7 @@ struct vk_device_struct { vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16; vk_pipeline pipeline_timestep_embedding_f32; vk_pipeline pipeline_pool2d_f32; + vk_pipeline pipeline_rwkv_wkv6_f32; // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned} vk_pipeline pipeline_flash_attn_f32_f16_D64[GGML_TYPE_COUNT][2][2][2]; @@ -528,6 +529,13 @@ struct vk_op_pool2d_push_constants { int32_t p0; int32_t p1; }; +struct vk_op_rwkv_wkv6_push_constants { + uint32_t B; + uint32_t T; + uint32_t C; + uint32_t H; +}; + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -1363,7 +1371,7 @@ static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vec // Needs to be kept up to date on shader changes const uint32_t bank_conflict_offset = device->coopmat_support ? 8 : 1; const uint32_t type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float); - const uint32_t warps = warptile[0] / device->subgroup_size; + const uint32_t warps = warptile[0] / warptile[10]; const uint32_t load_bufs = (warptile[1] + warptile[2]) * (warptile[3] + bank_conflict_offset) * type_size; const uint32_t mmid_row_ids = mul_mat_id ? 3072 * sizeof(uint32_t) : 0; @@ -1377,8 +1385,9 @@ static void ggml_vk_load_shaders(vk_device& device) { std::cerr << "ggml_vulkan: Compiling shaders"; - // some shaders require the subgroup size to be 16 or larger + // some shaders have a minimum subgroup size const uint32_t subgroup_size_16 = std::max(device->subgroup_size, 16u); + const uint32_t subgroup_size_32 = std::max(device->subgroup_size, 32u); // mulmat std::vector l_warptile, m_warptile, s_warptile, @@ -1445,7 +1454,7 @@ static void ggml_vk_load_shaders(vk_device& device) { l_warptile_mmq = { 128, 128, 128, 32, device->subgroup_size * 2, 64, 2, tm_l, tn_l, tk_l, device->subgroup_size }; m_warptile_mmq = { 128, 64, 64, 32, device->subgroup_size, 32, 2, tm_m, tn_m, tk_m, device->subgroup_size }; - s_warptile_mmq = { subgroup_size_16, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; + s_warptile_mmq = { subgroup_size_32, 32, 32, 32, 32, 32, 2, tm_s, tn_s, tk_s, device->subgroup_size }; l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; @@ -1846,53 +1855,58 @@ static void ggml_vk_load_shaders(vk_device& device) { // mul mat vec - // AMD GCN and Intel graphics cards perform best when the number of rows per shader is doubled - uint32_t rm = 1; - if ((device->vendor_id == VK_VENDOR_ID_AMD && device->subgroup_min_size == 64 && device->subgroup_max_size == 64) || device->vendor_id == VK_VENDOR_ID_INTEL) - rm = 2; + // the number of rows computed per shader depends on GPU model and quant + uint32_t rm_stdq = 1; + uint32_t rm_kq = 2; + if (device->vendor_id == VK_VENDOR_ID_AMD) { + if (device->subgroup_min_size == 64 && device->subgroup_max_size == 64) { // GCN + rm_stdq = 2; + rm_kq = 4; + } + } else if (device->vendor_id == VK_VENDOR_ID_INTEL) + rm_stdq = 2; - // computing additional rows per workgroup is a benefit for Q4_0 -> Q5_1, but not for Q8_0. ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f32_f32", mul_mat_vec_f32_f32_f32_len, mul_mat_vec_f32_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f32_f32", mul_mat_vec_q2_k_f32_f32_len, mul_mat_vec_q2_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f32_f32", mul_mat_vec_q3_k_f32_f32_len, mul_mat_vec_q3_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f32_f32", mul_mat_vec_q4_k_f32_f32_len, mul_mat_vec_q4_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f32_f32", mul_mat_vec_q5_k_f32_f32_len, mul_mat_vec_q5_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f32_f32", mul_mat_vec_q6_k_f32_f32_len, mul_mat_vec_q6_k_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f32_f32", mul_mat_vec_iq4_nl_f32_f32_len, mul_mat_vec_iq4_nl_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F32 ], "mul_mat_vec_f32_f16_f32", mul_mat_vec_f32_f16_f32_len, mul_mat_vec_f32_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_k_f16_f32", mul_mat_vec_q2_k_f16_f32_len, mul_mat_vec_q2_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_k_f16_f32", mul_mat_vec_q3_k_f16_f32_len, mul_mat_vec_q3_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_k_f16_f32", mul_mat_vec_q4_k_f16_f32_len, mul_mat_vec_q4_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_k_f16_f32", mul_mat_vec_q5_k_f16_f32_len, mul_mat_vec_q5_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_k_f16_f32", mul_mat_vec_q6_k_f16_f32_len, mul_mat_vec_q6_k_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_iq4_nl_f16_f32", mul_mat_vec_iq4_nl_f16_f32_len, mul_mat_vec_iq4_nl_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F32 ], "mul_mat_vec_id_f32_f32", mul_mat_vec_id_f32_f32_len, mul_mat_vec_id_f32_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2, 1, 1}, {device->subgroup_size, 2}, 1); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm, 1, 1}, {device->subgroup_size, 1*rm}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1, 1, 1}, {subgroup_size_16}, 1, true); - ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm, 1, 1}, {device->subgroup_size, 2*rm}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {device->subgroup_size, 2*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {1*rm_stdq, 1, 1}, {device->subgroup_size, 1*rm_stdq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_k_f32", mul_mat_vec_id_q2_k_f32_len, mul_mat_vec_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_k_f32", mul_mat_vec_id_q3_k_f32_len, mul_mat_vec_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_k_f32", mul_mat_vec_id_q4_k_f32_len, mul_mat_vec_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_k_f32", mul_mat_vec_id_q5_k_f32_len, mul_mat_vec_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_k_f32", mul_mat_vec_id_q6_k_f32_len, mul_mat_vec_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {rm_kq, 1, 1}, {subgroup_size_16, rm_kq}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_IQ4_NL], "mul_mat_vec_id_iq4_nl_f32", mul_mat_vec_id_iq4_nl_f32_len, mul_mat_vec_id_iq4_nl_f32_data, "main", 4, sizeof(vk_mat_vec_id_push_constants), {2*rm_stdq, 1, 1}, {subgroup_size_16, 2*rm_stdq}, 1, true); // dequant shaders ggml_vk_create_pipeline(device, device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -2014,6 +2028,8 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rwkv_wkv6_f32, "rwkv_wkv6_f32", rwkv_wkv6_f32_len, rwkv_wkv6_f32_data, "main", 7, sizeof(vk_op_rwkv_wkv6_push_constants), {1, 1, 1}, {device->subgroup_size}, 1); + for (auto &c : compiles) { c.wait(); } @@ -3194,8 +3210,8 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont GGML_ABORT("fatal error"); } // Check if src is pinned memory - vk_buffer buf; - size_t buf_offset; + vk_buffer buf = nullptr; + size_t buf_offset = 0; ggml_vk_host_get(ctx->device, tensor->data, buf, buf_offset); const uint64_t ne0 = tensor->ne[0]; @@ -3258,7 +3274,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont VkBufferCopy buf_copy{ 0, offset, copy_size }; ggml_vk_sync_buffers(subctx); - vkCmdCopyBuffer(subctx->s->buffer, staging->buffer, dst->buffer, 1, &buf_copy); + vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); for (uint64_t i3 = 0; i3 < ne3; i3++) { for (uint64_t i2 = 0; i2 < ne2; i2++) { @@ -3291,7 +3307,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz } // Check if src is pinned memory vk_buffer buf = nullptr; - size_t buf_offset; + size_t buf_offset = 0; ggml_vk_host_get(dst->device, src, buf, buf_offset); if (buf != nullptr) { @@ -3333,7 +3349,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz copy_size}; ggml_vk_sync_buffers(subctx); - vkCmdCopyBuffer(subctx->s->buffer, staging_buffer->buffer, dst->buffer, 1, &buf_copy); + vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); if (width == spitch) { deferred_memcpy((uint8_t *)staging_buffer->ptr, src, width * height, &subctx->in_memcpys); @@ -3389,7 +3405,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size // Check if dst is pinned memory vk_buffer buf = nullptr; - size_t buf_offset; + size_t buf_offset = 0; ggml_vk_host_get(src->device, dst, buf, buf_offset); std::vector slices(1); @@ -3469,7 +3485,7 @@ static void ggml_vk_buffer_copy_async(vk_context& ctx, vk_buffer& dst, size_t ds VkBufferCopy bc{ src_offset, dst_offset, size }; - vkCmdCopyBuffer(ctx->s->buffer, src->buffer, dst->buffer, 1, &bc); + vkCmdCopyBuffer(ctx->s->buffer, (VkBuffer)src->buffer, (VkBuffer)dst->buffer, 1, &bc); } static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& src, size_t src_offset, size_t size) { @@ -3721,9 +3737,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; - vk_buffer d_Qx; + vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; - vk_buffer d_Qy; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; bool src0_uma = false; @@ -3923,9 +3939,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; - vk_buffer d_Qx; + vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; - vk_buffer d_Qy; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; bool src0_uma = false; @@ -4101,7 +4117,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c ggml_backend_vk_buffer_context * src0_buf_ctx = (ggml_backend_vk_buffer_context *)src0->buffer->context; ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; - vk_buffer d_Qy; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; bool src1_uma = false; @@ -4289,11 +4305,11 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; - vk_buffer d_Qx; + vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; - vk_buffer d_Qy; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; - vk_buffer d_ids; + vk_buffer d_ids = nullptr; size_t ids_buf_offset = 0; bool src0_uma = false; @@ -4494,11 +4510,11 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte ggml_backend_vk_buffer_context * src1_buf_ctx = (ggml_backend_vk_buffer_context *)src1->buffer->context; ggml_backend_vk_buffer_context * ids_buf_ctx = (ggml_backend_vk_buffer_context *)ids->buffer->context; - vk_buffer d_Qx; + vk_buffer d_Qx = nullptr; size_t qx_buf_offset = 0; - vk_buffer d_Qy; + vk_buffer d_Qy = nullptr; size_t qy_buf_offset = 0; - vk_buffer d_ids; + vk_buffer d_ids = nullptr; size_t ids_buf_offset = 0; bool src0_uma = false; @@ -4757,8 +4773,8 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_sync_buffers(subctx); - vk_buffer d_Q, d_K, d_V, d_D, d_M; - uint64_t q_buf_offset, k_buf_offset, v_buf_offset, d_buf_offset, m_buf_offset; + vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr; + size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0; bool Q_uma = false, K_uma = false, V_uma = false, D_uma = false, M_uma = false; @@ -5022,6 +5038,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ctx->device->pipeline_pool2d_f32; } return nullptr; + case GGML_OP_RWKV_WKV6: + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_rwkv_wkv6_f32; + } + return nullptr; case GGML_OP_LEAKY_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_leaky_relu_f32; @@ -5424,6 +5445,134 @@ static void ggml_vk_div(ggml_backend_vk_context * ctx, vk_context& subctx, const }, dryrun); } +static void ggml_vk_op_f32_rwkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, const vk_op_rwkv_wkv6_push_constants&& pc, bool dryrun = false) { + const ggml_tensor * k = dst->src[0]; + const ggml_tensor * v = dst->src[1]; + const ggml_tensor * r = dst->src[2]; + const ggml_tensor * tf = dst->src[3]; + const ggml_tensor * td = dst->src[4]; + const ggml_tensor * state = dst->src[5]; + + GGML_ASSERT(!ggml_is_quantized(k->type)); + GGML_ASSERT(!ggml_is_quantized(v->type)); + GGML_ASSERT(!ggml_is_quantized(r->type)); + GGML_ASSERT(!ggml_is_quantized(tf->type)); + GGML_ASSERT(!ggml_is_quantized(td->type)); + GGML_ASSERT(!ggml_is_quantized(state->type)); + GGML_ASSERT(dst->buffer != nullptr); + + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, k, v, r, dst, GGML_OP_RWKV_WKV6); + GGML_ASSERT(pipeline != nullptr); + + if (dryrun) { + ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + return; + } + + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_buffer_context * k_buf_ctx = (ggml_backend_vk_buffer_context *)k->buffer->context; + ggml_backend_vk_buffer_context * v_buf_ctx = (ggml_backend_vk_buffer_context *)v->buffer->context; + ggml_backend_vk_buffer_context * r_buf_ctx = (ggml_backend_vk_buffer_context *)r->buffer->context; + ggml_backend_vk_buffer_context * tf_buf_ctx = (ggml_backend_vk_buffer_context *)tf->buffer->context; + ggml_backend_vk_buffer_context * td_buf_ctx = (ggml_backend_vk_buffer_context *)td->buffer->context; + ggml_backend_vk_buffer_context * state_buf_ctx = (ggml_backend_vk_buffer_context *)state->buffer->context; + + ggml_vk_sync_buffers(subctx); + + vk_buffer d_D = nullptr, d_K = nullptr, d_V = nullptr, d_R = nullptr, d_TF = nullptr, d_TD = nullptr, d_State = nullptr; + size_t k_offset = 0, v_offset = 0, r_offset = 0, tf_offset = 0, td_offset = 0, state_offset = 0, dst_offset = 0; + bool K_uma = false, V_uma = false, R_uma = false, TF_uma = false, TD_uma = false, STATE_uma = false, DST_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx->device, k->data, d_K, k_offset); + ggml_vk_host_get(ctx->device, v->data, d_V, v_offset); + ggml_vk_host_get(ctx->device, r->data, d_R, r_offset); + ggml_vk_host_get(ctx->device, tf->data, d_TF, tf_offset); + ggml_vk_host_get(ctx->device, td->data, d_TD, td_offset); + ggml_vk_host_get(ctx->device, state->data, d_State, state_offset); + ggml_vk_host_get(ctx->device, dst->data, d_D, dst_offset); + + K_uma = d_K != nullptr; + V_uma = d_V != nullptr; + R_uma = d_R != nullptr; + TF_uma = d_TF != nullptr; + TD_uma = d_TD != nullptr; + STATE_uma = d_State != nullptr; + DST_uma = d_D != nullptr; + } + + if (!K_uma) { + d_K = k_buf_ctx->dev_buffer; + k_offset = vk_tensor_offset(k) + k->view_offs; + } + if (!V_uma) { + d_V = v_buf_ctx->dev_buffer; + v_offset = vk_tensor_offset(v) + v->view_offs; + } + if (!R_uma) { + d_R = r_buf_ctx->dev_buffer; + r_offset = vk_tensor_offset(r) + r->view_offs; + } + if (!TF_uma) { + d_TF = tf_buf_ctx->dev_buffer; + tf_offset = vk_tensor_offset(tf) + tf->view_offs; + } + if (!TD_uma) { + d_TD = td_buf_ctx->dev_buffer; + td_offset = vk_tensor_offset(td) + td->view_offs; + } + if (!STATE_uma) { + d_State = state_buf_ctx->dev_buffer; + state_offset = vk_tensor_offset(state) + state->view_offs; + } + if (!DST_uma) { + d_D = dst_buf_ctx->dev_buffer; + dst_offset = vk_tensor_offset(dst) + dst->view_offs; + } + + const uint64_t k_size = ggml_nbytes(k); + const uint64_t v_size = ggml_nbytes(v); + const uint64_t r_size = ggml_nbytes(r); + const uint64_t tf_size = ggml_nbytes(tf); + const uint64_t td_size = ggml_nbytes(td); + const uint64_t state_size = ggml_nbytes(state); + const uint64_t dst_size = ggml_nbytes(dst); + + std::array elements = { + (uint32_t)(pc.B * pc.H), + 1, + 1 + }; + + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { + vk_subbuffer{ d_K, k_offset, k_size }, + vk_subbuffer{ d_V, v_offset, v_size }, + vk_subbuffer{ d_R, r_offset, r_size }, + vk_subbuffer{ d_TF, tf_offset, tf_size }, + vk_subbuffer{ d_TD, td_offset, td_size }, + vk_subbuffer{ d_State, state_offset, state_size }, + vk_subbuffer{ d_D, dst_offset, dst_size } + }, sizeof(vk_op_rwkv_wkv6_push_constants), &pc, elements); +} + +static void ggml_vk_rwkv_wkv6(ggml_backend_vk_context * ctx, vk_context& subctx, ggml_tensor * dst, bool dryrun = false) { + const size_t seq_length = dst->src[0]->ne[3]; + const size_t n_embed = dst->ne[0]; + const size_t n_heads = dst->src[0]->ne[2]; + const size_t n_seqs = dst->src[5]->ne[1]; + + ggml_vk_op_f32_rwkv6( + ctx, subctx, dst, + { + (uint32_t)n_seqs, + (uint32_t)seq_length, + (uint32_t)n_embed, + (uint32_t)n_heads, + }, + dryrun + ); +} + static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { int * op_params = (int *)dst->op_params; @@ -6569,6 +6718,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: + case GGML_OP_RWKV_WKV6: case GGML_OP_LEAKY_RELU: case GGML_OP_FLASH_ATTN_EXT: break; @@ -6768,6 +6918,11 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_FLASH_ATTN_EXT: ggml_vk_flash_attn(ctx, compute_ctx, src0, src1, src2, src3, node, dryrun); + break; + + case GGML_OP_RWKV_WKV6: + ggml_vk_rwkv_wkv6(ctx, compute_ctx, node, dryrun); + break; default: return false; @@ -6848,6 +7003,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor * case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: + case GGML_OP_RWKV_WKV6: case GGML_OP_LEAKY_RELU: case GGML_OP_REPEAT: buf = tensor->buffer; @@ -7724,6 +7880,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_IM2COL: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_POOL_2D: + case GGML_OP_RWKV_WKV6: case GGML_OP_LEAKY_RELU: return true; default: @@ -8300,7 +8457,11 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) { } else if (tensor->op == GGML_OP_LEAKY_RELU) { const float * op_params = (const float *)tensor->op_params; tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false); - } else { + } else if (tensor->op == GGML_OP_RWKV_WKV6) { + tensor_clone = ggml_rwkv_wkv6(ggml_ctx, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], + tensor->src[4], tensor->src[5]); + } + else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ABORT("fatal error"); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp index a8707b621..94b78598e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/dequant_funcs_cm2.comp @@ -10,9 +10,10 @@ float16_t dequantFuncQ4_0(const in decodeBufQ4_0 bl, const in uint blockCoords[2 const float16_t d = bl.block.d; const uint idx = coordInBlock[1]; const uint shift = (idx & 0x10) >> 2; - uint32_t qs = unpack8(uint32_t(bl.block.qs[(idx & 0xE) >> 1]))[idx & 1]; + uint32_t qs = uint32_t(bl.block.qs[(idx & 0xE) >> 1]); qs >>= shift; - qs &= 0xF; + qs &= 0x0F0F; + qs = unpack8(qs)[idx & 1]; float16_t ret = (float16_t(qs) - float16_t(8)) * d; return ret; } @@ -152,15 +153,17 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4 block_q4_K block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ4_K_packed16 { + block_q4_K_packed16 block; +}; + float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { + decodeBufQ4_K_packed16 bl16 = decodeBufQ4_K_packed16(bl); const uint idx = coordInBlock[1]; - const uint iqs = idx; - const uint n = iqs / 64; // 0,1,2,3 - const uint b = (iqs % 64) / 32; // 0,1 + const uint b = (idx & 0x20) >> 5; // 0,1 const uint is = (idx & 0xE0) >> 5; // 0..7 - const uint qsi = n * 32 + (iqs % 32); // 0..127 const f16vec2 loadd = bl.block.d; @@ -184,9 +187,11 @@ float16_t dequantFuncQ4_K(const in decodeBufQ4_K bl, const in uint blockCoords[2 const float16_t d = loadd.x * float16_t(sc); const float16_t m = loadd.y * float16_t(mbyte); - uint32_t dmask = 0xF << (b * 4); + uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]); + qs = (qs >> (b * 4)) & 0x0F0F; + qs = unpack8(qs)[idx & 1]; - float16_t ret = d * float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) - m; + float16_t ret = d * float16_t(qs) - m; return ret; } @@ -195,18 +200,19 @@ layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5 block_q5_K block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ5_K_packed16 { + block_q5_K_packed16 block; +}; + float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { + decodeBufQ5_K_packed16 bl16 = decodeBufQ5_K_packed16(bl); const uint idx = coordInBlock[1]; - const uint iqs = idx; - const uint n = iqs / 64; // 0,1,2,3 - const uint b = (iqs % 64) / 32; // 0,1 + const uint b = (idx & 0x20) >> 5; // 0,1 const uint is = (idx & 0xE0) >> 5; // 0..7 - const uint qsi = n * 32 + (iqs % 32); // 0..127 - const uint qhi = (iqs % 32); // 0..31 - const uint8_t hm = uint8_t(1 << (iqs / 32)); + const uint32_t hm = 0x0101 << is; const f16vec2 loadd = bl.block.d; @@ -230,9 +236,15 @@ float16_t dequantFuncQ5_K(const in decodeBufQ5_K bl, const in uint blockCoords[2 const float16_t d = loadd.x * float16_t(sc); const float16_t m = loadd.y * float16_t(mbyte); - uint32_t dmask = 0xF << (b * 4); + uint qh = uint32_t(bl16.block.qh[(idx & 0x1E) >> 1]); + qh = qh & hm; + qh = unpack8(qh)[idx & 1]; - float16_t ret = d * (float16_t((bl.block.qs[qsi ] & dmask) >> (b * 4)) + float16_t((bl.block.qh[qhi ] & hm) != 0 ? 16 : 0)) - m; + uint qs = uint32_t(bl16.block.qs[((idx & 0xC0) >> 2) + ((idx & 0x1E) >> 1)]); + qs = (qs >> (b * 4)) & 0x0F0F; + qs = unpack8(qs)[idx & 1]; + + float16_t ret = d * (float16_t(qs) + (qh != 0 ? float16_t(16) : float16_t(0))) - m; return ret; } @@ -241,22 +253,30 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufQ6_ block_q6_K block; }; +layout(buffer_reference, std430, buffer_reference_align = 16) buffer decodeBufQ6_K_packed16 { + block_q6_K_packed16 block; +}; + float16_t dequantFuncQ6_K(const in decodeBufQ6_K bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { + decodeBufQ6_K_packed16 bl16 = decodeBufQ6_K_packed16(bl); const uint idx = coordInBlock[1]; - const uint iqs = idx; - const uint n = iqs / 128; // 0,1 - const uint b = (iqs % 128) / 64; // 0,1 - const uint is_b = (iqs % 32) / 16; // 0,1 - const uint qhshift = ((iqs % 128) / 32) * 2;// 0,2,4,6 - const uint is = 8 * n + qhshift + is_b; // 0..15 - const uint qsi = n * 64 + (iqs % 64); // 0..127 - const uint qhi = n * 32 + (iqs % 32); // 0..63 + const uint b = (idx & 0x40) >> 6; // 0,1 + const uint qhshift = (idx & 0x60) >> 4; // 0,2,4,6 + const uint is = (idx & 0xF0) >> 4; // 0..15 const float16_t dscale = bl.block.d * float16_t(bl.block.scales[is]); - float16_t ret = dscale * float16_t(int8_t(((bl.block.ql[qsi ] >> (b * 4)) & 0xF) | (((bl.block.qh[qhi ] >> qhshift) & 3) << 4)) - 32); + uint ql = uint32_t(bl16.block.ql[((idx & 0x80) >> 2) + ((idx & 0x3E) >> 1)]); + ql = (ql >> (b * 4)) & 0x0F0F; + + uint qh = uint32_t(bl16.block.qh[((idx & 0x80) >> 3) + ((idx & 0x1E) >> 1)]); + qh = ((qh >> qhshift) & 0x0303) << 4; + + int q = unpack8(ql | qh)[idx & 1]; + + float16_t ret = dscale * float16_t(q - 32); return ret; } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp index 1a5350d99..138ad0184 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q2_k.comp @@ -6,21 +6,15 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; +layout (constant_id = 1) const uint NUM_ROWS = 1; -shared FLOAT_TYPE tmp[BLOCK_SIZE]; - -void main() { - const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; - - if (row >= p.stride_d) { - return; - } +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; @@ -38,15 +32,15 @@ void main() { const uint s_offset = 8*v_im; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp[NUM_ROWS]; + + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; - f16vec2 d = data_a[ib0 + i].d; - const FLOAT_TYPE dall = d.x; - const FLOAT_TYPE dmin = d.y; - B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0]; B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8]; B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16]; @@ -56,58 +50,84 @@ void main() { B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48]; B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56]; - uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0]; - uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = d.x; + const FLOAT_TYPE dmin = d.y; - uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F; - uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F; - uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F; - uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F; + uint32_t s0_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 0]; + uint32_t s4_u32 = data_a_packed32[ib0 + i].scales[s_offset / 4 + 1]; - uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32)); - uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32)); - uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32)); - uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32)); + uint32_t s0_lo4_u32 = s0_u32 & 0x0F0F0F0F; + uint32_t s0_hi4_u32 = (s0_u32 >> 4) & 0x0F0F0F0F; + uint32_t s4_lo4_u32 = s4_u32 & 0x0F0F0F0F; + uint32_t s4_hi4_u32 = (s4_u32 >> 4) & 0x0F0F0F0F; - uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0]; - uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]; - uvec2 qs0 = uvec2(unpack8(qs0_u16)); - uvec2 qs16 = uvec2(unpack8(qs16_u16)); + uvec4 s0_lo4 = uvec4(unpack8(s0_lo4_u32)); + uvec4 s4_lo4 = uvec4(unpack8(s4_lo4_u32)); + uvec4 s0_hi4 = uvec4(unpack8(s0_hi4_u32)); + uvec4 s4_hi4 = uvec4(unpack8(s4_hi4_u32)); - FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); - FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 2; ++l) { - sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), - fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), - fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), - fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3), - fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3), - fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3), - fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3), - fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1)))))))); - sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]), - fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]), - fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]), - fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]), - fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]), - fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]), - fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]), - fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2)))))))); + uint16_t qs0_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 0]; + uint16_t qs16_u16 = data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]; + uvec2 qs0 = uvec2(unpack8(qs0_u16)); + uvec2 qs16 = uvec2(unpack8(qs16_u16)); + + FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); + FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); + [[unroll]] for (int l = 0; l < 2; ++l) { + sum1 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_lo4[0]) * FLOAT_TYPE((qs0[l] >> 0) & 3), + fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_lo4[1]) * FLOAT_TYPE((qs16[l] >> 0) & 3), + fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_lo4[2]) * FLOAT_TYPE((qs0[l] >> 2) & 3), + fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_lo4[3]) * FLOAT_TYPE((qs16[l] >> 2) & 3), + fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_lo4[0]) * FLOAT_TYPE((qs0[l] >> 4) & 3), + fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_lo4[1]) * FLOAT_TYPE((qs16[l] >> 4) & 3), + fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_lo4[2]) * FLOAT_TYPE((qs0[l] >> 6) & 3), + fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_lo4[3]) * FLOAT_TYPE((qs16[l] >> 6) & 3), sum1)))))))); + sum2 = fma(FLOAT_TYPE(b0[l]), FLOAT_TYPE(s0_hi4[0]), + fma(FLOAT_TYPE(b16[l]), FLOAT_TYPE(s0_hi4[1]), + fma(FLOAT_TYPE(b32[l]), FLOAT_TYPE(s0_hi4[2]), + fma(FLOAT_TYPE(b48[l]), FLOAT_TYPE(s0_hi4[3]), + fma(FLOAT_TYPE(b64[l]), FLOAT_TYPE(s4_hi4[0]), + fma(FLOAT_TYPE(b80[l]), FLOAT_TYPE(s4_hi4[1]), + fma(FLOAT_TYPE(b96[l]), FLOAT_TYPE(s4_hi4[2]), + fma(FLOAT_TYPE(b112[l]), FLOAT_TYPE(s4_hi4[3]), sum2)))))))); + } + temp[n] = fma(dall, sum1, fma(-dmin, sum2, temp[n])); } - temp = fma(dall, sum1, fma(-dmin, sum2, temp)); } - tmp[gl_LocalInvocationID.x] = temp; - // sum up partial sums and write back result + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } barrier(); - [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { if (tid < s) { - tmp[tid] += tmp[tid + s]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } } barrier(); } if (tid == 0) { - data_d[d_offset + row] = D_TYPE(tmp[0]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + // do NUM_ROWS at a time, unless there aren't enough remaining rows + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp index b19c38111..82ec42d25 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp @@ -6,21 +6,15 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; +layout (constant_id = 1) const uint NUM_ROWS = 1; -shared FLOAT_TYPE tmp[BLOCK_SIZE]; - -void main() { - const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; - - if (row >= p.stride_d) { - return; - } +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; @@ -35,19 +29,21 @@ void main() { const uint8_t m = uint8_t(1 << (4 * v_im)); - const uint l0 = 2*v_in; // 0...15 + const uint l0 = 2*v_in; // 0...15 const uint q_offset = 32*v_im + l0; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp[NUM_ROWS]; + + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } const uint s_shift = 4 * v_im; [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y_idx = i * QUANT_K + y_offset; - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0]; B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8]; B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16]; @@ -57,44 +53,68 @@ void main() { B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48]; B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56]; - uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0]; - uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1]; - uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2]; - uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3]; - uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4]; - uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5]; - u8vec2 s0 = unpack8(s0_16); - u8vec2 s2 = unpack8(s2_16); - u8vec2 s4 = unpack8(s4_16); - u8vec2 s6 = unpack8(s6_16); - u8vec2 s8 = unpack8(s8_16); - u8vec2 s10 = unpack8(s10_16); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 2; ++l) { - sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)), - fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum)))))))); + uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0]; + uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1]; + uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2]; + uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3]; + uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4]; + uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5]; + u8vec2 s0 = unpack8(s0_16); + u8vec2 s2 = unpack8(s2_16); + u8vec2 s4 = unpack8(s4_16); + u8vec2 s6 = unpack8(s6_16); + u8vec2 s8 = unpack8(s8_16); + u8vec2 s10 = unpack8(s10_16); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0); + [[unroll]] for (int l = 0; l < 2; ++l) { + sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)), + fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum)))))))); + } + temp[n] = fma(d, sum, temp[n]); } - temp = fma(d, sum, temp); } - tmp[gl_LocalInvocationID.x] = temp; - // sum up partial sums and write back result + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } barrier(); - [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { if (tid < s) { - tmp[tid] += tmp[tid + s]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } } barrier(); } if (tid == 0) { - data_d[d_offset + row] = D_TYPE(tmp[0]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + // do NUM_ROWS at a time, unless there aren't enough remaining rows + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp index b86d28589..677c207a8 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q4_k.comp @@ -7,21 +7,15 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; +layout (constant_id = 1) const uint NUM_ROWS = 1; -shared FLOAT_TYPE tmp[BLOCK_SIZE]; - -void main() { - const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; - - if (row >= p.stride_d) { - return; - } +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; @@ -31,8 +25,8 @@ void main() { const uint step = 4; - const uint il = itid/step; // 0...3 - const uint ir = itid - step*il; // 0...7 or 0...3 + const uint il = itid/step; // 0...3 + const uint ir = itid - step*il; // 0...7 or 0...3 const uint n = 4; const uint v_im = il / 2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 @@ -42,90 +36,116 @@ void main() { const uint q_offset = 32*v_im + l0; const uint y_offset = 64*v_im + l0; - FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp[NUM_ROWS]; + + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; - f16vec2 d = data_a[ib0 + i].d; - const FLOAT_TYPE dall = FLOAT_TYPE(d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - - uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; - uvec4 scale0 = uvec4(unpack8(scale0_u32)); - uvec4 scale4 = uvec4(unpack8(scale4_u32)); - uvec4 scale8 = uvec4(unpack8(scale8_u32)); - - const uint32_t sc0 = ( scale0.x & 0x3f); - const uint32_t sc1 = ( scale0.y & 0x3f); - const uint32_t sc2 = ( scale4.x & 0x3f); - const uint32_t sc3 = ( scale4.y & 0x3f); - const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); - const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); - const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); - const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - - uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; - uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; - - uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; - uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; - uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F; - uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F; - - uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4)); - uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4)); - uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4)); - uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4)); - - const uint32_t q4_0 = qs0_lo4.x; - const uint32_t q4_1 = qs0_lo4.y; - const uint32_t q4_2 = qs0_lo4.z; - const uint32_t q4_3 = qs0_lo4.w; - const uint32_t q4_4 = qs0_hi4.x; - const uint32_t q4_5 = qs0_hi4.y; - const uint32_t q4_6 = qs0_hi4.z; - const uint32_t q4_7 = qs0_hi4.w; - const uint32_t q4_8 = qs64_lo4.x; - const uint32_t q4_9 = qs64_lo4.y; - const uint32_t q4_10 = qs64_lo4.z; - const uint32_t q4_11 = qs64_lo4.w; - const uint32_t q4_12 = qs64_hi4.x; - const uint32_t q4_13 = qs64_hi4.y; - const uint32_t q4_14 = qs64_hi4.z; - const uint32_t q4_15 = qs64_hi4.w; - B_TYPE_VEC4 by10 = data_b_v4[(b_offset + y1_idx) / 4]; B_TYPE_VEC4 by132 = data_b_v4[(b_offset + y1_idx) / 4 + 8]; B_TYPE_VEC4 by20 = data_b_v4[(b_offset + y2_idx) / 4]; B_TYPE_VEC4 by232 = data_b_v4[(b_offset + y2_idx) / 4 + 8]; - const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); - const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); - const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11))); - const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15))); - const FLOAT_TYPE smin = - fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7, - fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7, - fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7, - fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7))))))))))))))); - temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + uvec4 scale0 = uvec4(unpack8(scale0_u32)); + uvec4 scale4 = uvec4(unpack8(scale4_u32)); + uvec4 scale8 = uvec4(unpack8(scale8_u32)); + + const uint32_t sc0 = ( scale0.x & 0x3f); + const uint32_t sc1 = ( scale0.y & 0x3f); + const uint32_t sc2 = ( scale4.x & 0x3f); + const uint32_t sc3 = ( scale4.y & 0x3f); + const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); + const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); + const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); + const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); + + uint32_t qs0_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4]; + uint32_t qs64_u32 = data_a_packed32[ib0 + i].qs[q_offset / 4 + 16]; + + uint32_t qs0_u32_lo4 = qs0_u32 & 0x0F0F0F0F; + uint32_t qs0_u32_hi4 = (qs0_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_u32_lo4 = qs64_u32 & 0x0F0F0F0F; + uint32_t qs64_u32_hi4 = (qs64_u32 >> 4) & 0x0F0F0F0F; + + uvec4 qs0_lo4 = uvec4(unpack8(qs0_u32_lo4)); + uvec4 qs64_lo4 = uvec4(unpack8(qs64_u32_lo4)); + uvec4 qs0_hi4 = uvec4(unpack8(qs0_u32_hi4)); + uvec4 qs64_hi4 = uvec4(unpack8(qs64_u32_hi4)); + + const uint32_t q4_0 = qs0_lo4.x; + const uint32_t q4_1 = qs0_lo4.y; + const uint32_t q4_2 = qs0_lo4.z; + const uint32_t q4_3 = qs0_lo4.w; + const uint32_t q4_4 = qs0_hi4.x; + const uint32_t q4_5 = qs0_hi4.y; + const uint32_t q4_6 = qs0_hi4.z; + const uint32_t q4_7 = qs0_hi4.w; + const uint32_t q4_8 = qs64_lo4.x; + const uint32_t q4_9 = qs64_lo4.y; + const uint32_t q4_10 = qs64_lo4.z; + const uint32_t q4_11 = qs64_lo4.w; + const uint32_t q4_12 = qs64_hi4.x; + const uint32_t q4_13 = qs64_hi4.y; + const uint32_t q4_14 = qs64_hi4.z; + const uint32_t q4_15 = qs64_hi4.w; + + const FLOAT_TYPE sx = fma(FLOAT_TYPE(by10.x), q4_0, fma(FLOAT_TYPE(by10.y), q4_1, fma(FLOAT_TYPE(by10.z), q4_2, FLOAT_TYPE(by10.w) * q4_3))); + const FLOAT_TYPE sy = fma(FLOAT_TYPE(by132.x), q4_4, fma(FLOAT_TYPE(by132.y), q4_5, fma(FLOAT_TYPE(by132.z), q4_6, FLOAT_TYPE(by132.w) * q4_7))); + const FLOAT_TYPE sz = fma(FLOAT_TYPE(by20.x), q4_8, fma(FLOAT_TYPE(by20.y), q4_9, fma(FLOAT_TYPE(by20.z), q4_10, FLOAT_TYPE(by20.w) * q4_11))); + const FLOAT_TYPE sw = fma(FLOAT_TYPE(by232.x), q4_12, fma(FLOAT_TYPE(by232.y), q4_13, fma(FLOAT_TYPE(by232.z), q4_14, FLOAT_TYPE(by232.w) * q4_15))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(by10.x), sc2, fma(FLOAT_TYPE(by132.x), sc3, fma(FLOAT_TYPE(by20.x), sc6, fma(FLOAT_TYPE(by232.x), sc7, + fma(FLOAT_TYPE(by10.y), sc2, fma(FLOAT_TYPE(by132.y), sc3, fma(FLOAT_TYPE(by20.y), sc6, fma(FLOAT_TYPE(by232.y), sc7, + fma(FLOAT_TYPE(by10.z), sc2, fma(FLOAT_TYPE(by132.z), sc3, fma(FLOAT_TYPE(by20.z), sc6, fma(FLOAT_TYPE(by232.z), sc7, + fma(FLOAT_TYPE(by10.w), sc2, fma(FLOAT_TYPE(by132.w), sc3, fma(FLOAT_TYPE(by20.w), sc6, FLOAT_TYPE(by232.w) * sc7))))))))))))))); + temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n])); + } } - tmp[gl_LocalInvocationID.x] = temp; - // sum up partial sums and write back result + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } barrier(); - [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { if (tid < s) { - tmp[tid] += tmp[tid + s]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } } barrier(); } if (tid == 0) { - data_d[d_offset + row] = D_TYPE(tmp[0]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + // do NUM_ROWS at a time, unless there aren't enough remaining rows + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp index fd243cf91..ed3c25d89 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q5_k.comp @@ -7,21 +7,15 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; +layout (constant_id = 1) const uint NUM_ROWS = 1; -shared FLOAT_TYPE tmp[BLOCK_SIZE]; - -void main() { - const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; - - if (row >= p.stride_d) { - return; - } +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; @@ -39,74 +33,16 @@ void main() { const uint q_offset = 32*v_im + l0; const uint y_offset = 64*v_im + l0; - FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp[NUM_ROWS]; + + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { const uint y1_idx = i * QUANT_K + y_offset; const uint y2_idx = y1_idx + 128; - f16vec2 d = data_a[ib0 + i].d; - const FLOAT_TYPE dall = FLOAT_TYPE(d.x); - const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); - - uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; - uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; - uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; - uvec4 scale0 = uvec4(unpack8(scale0_u32)); - uvec4 scale4 = uvec4(unpack8(scale4_u32)); - uvec4 scale8 = uvec4(unpack8(scale8_u32)); - - const uint32_t sc0 = ( scale0.x & 0x3f); - const uint32_t sc1 = ( scale0.y & 0x3f); - const uint32_t sc2 = ( scale4.x & 0x3f); - const uint32_t sc3 = ( scale4.y & 0x3f); - const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); - const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); - const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); - const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); - - uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); - uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); - - uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; - uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; - uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; - uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; - - uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8])); - - uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4; - uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3; - uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0; - uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1; - - qs0_16_u32_lo4 += qs0_16_lo4_offset16; - qs0_16_u32_hi4 += qs0_16_hi4_offset16; - qs64_80_u32_lo4 += qs64_80_lo4_offset16; - qs64_80_u32_hi4 += qs64_80_hi4_offset16; - - uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); - uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); - uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); - uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); - - const uint32_t q4_0 = qs0_16_lo4.x; - const uint32_t q4_1 = qs0_16_lo4.y; - const uint32_t q4_2 = qs0_16_lo4.z; - const uint32_t q4_3 = qs0_16_lo4.w; - const uint32_t q4_4 = qs0_16_hi4.x; - const uint32_t q4_5 = qs0_16_hi4.y; - const uint32_t q4_6 = qs0_16_hi4.z; - const uint32_t q4_7 = qs0_16_hi4.w; - const uint32_t q4_8 = qs64_80_lo4.x; - const uint32_t q4_9 = qs64_80_lo4.y; - const uint32_t q4_10 = qs64_80_lo4.z; - const uint32_t q4_11 = qs64_80_lo4.w; - const uint32_t q4_12 = qs64_80_hi4.x; - const uint32_t q4_13 = qs64_80_hi4.y; - const uint32_t q4_14 = qs64_80_hi4.z; - const uint32_t q4_15 = qs64_80_hi4.w; - B_TYPE_VEC2 by10 = data_b_v2[(b_offset + y1_idx) / 2]; B_TYPE_VEC2 by116 = data_b_v2[(b_offset + y1_idx) / 2 + 8]; B_TYPE_VEC2 by132 = data_b_v2[(b_offset + y1_idx) / 2 + 16]; @@ -116,45 +52,129 @@ void main() { B_TYPE_VEC2 by232 = data_b_v2[(b_offset + y2_idx) / 2 + 16]; B_TYPE_VEC2 by248 = data_b_v2[(b_offset + y2_idx) / 2 + 24]; - const FLOAT_TYPE sx = - fma(FLOAT_TYPE(by10.x), q4_0, - fma(FLOAT_TYPE(by10.y), q4_1, - fma(FLOAT_TYPE(by116.x), q4_2, - FLOAT_TYPE(by116.y) * q4_3))); - const FLOAT_TYPE sy = - fma(FLOAT_TYPE(by132.x), q4_4, - fma(FLOAT_TYPE(by132.y), q4_5, - fma(FLOAT_TYPE(by148.x), q4_6, - FLOAT_TYPE(by148.y) * q4_7))); - const FLOAT_TYPE sz = - fma(FLOAT_TYPE(by20.x), q4_8, - fma(FLOAT_TYPE(by20.y), q4_9, - fma(FLOAT_TYPE(by216.x), q4_10, - FLOAT_TYPE(by216.y) * q4_11))); - const FLOAT_TYPE sw = - fma(FLOAT_TYPE(by232.x), q4_12, - fma(FLOAT_TYPE(by232.y), q4_13, - fma(FLOAT_TYPE(by248.x), q4_14, - FLOAT_TYPE(by248.y) * q4_15))); - const FLOAT_TYPE smin = - fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, - fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, - fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, - (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); - temp = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp)); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + f16vec2 d = data_a[ib0 + i].d; + const FLOAT_TYPE dall = FLOAT_TYPE(d.x); + const FLOAT_TYPE dmin = FLOAT_TYPE(d.y); + + uint32_t scale0_u32 = data_a_packed16[ib0 + i].scales[v_im ]; + uint32_t scale4_u32 = data_a_packed16[ib0 + i].scales[v_im + 2]; + uint32_t scale8_u32 = data_a_packed16[ib0 + i].scales[v_im + 4]; + uvec4 scale0 = uvec4(unpack8(scale0_u32)); + uvec4 scale4 = uvec4(unpack8(scale4_u32)); + uvec4 scale8 = uvec4(unpack8(scale8_u32)); + + const uint32_t sc0 = ( scale0.x & 0x3f); + const uint32_t sc1 = ( scale0.y & 0x3f); + const uint32_t sc2 = ( scale4.x & 0x3f); + const uint32_t sc3 = ( scale4.y & 0x3f); + const uint32_t sc4 = (( scale8.x & 0x0f) | ((scale0.x & 0xc0) >> 2)); + const uint32_t sc5 = (( scale8.y & 0x0f) | ((scale0.y & 0xc0) >> 2)); + const uint32_t sc6 = (((scale8.x >> 4) & 0x0f) | ((scale4.x & 0xc0) >> 2)); + const uint32_t sc7 = (((scale8.y >> 4) & 0x0f) | ((scale4.y & 0xc0) >> 2)); + + uint32_t qs0_16_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 8]) << 16); + uint32_t qs64_80_u32 = uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 32]) | (uint32_t(data_a_packed16[ib0 + i].qs[q_offset / 2 + 40]) << 16); + + uint32_t qs0_16_u32_lo4 = qs0_16_u32 & 0x0F0F0F0F; + uint32_t qs0_16_u32_hi4 = (qs0_16_u32 >> 4) & 0x0F0F0F0F; + uint32_t qs64_80_u32_lo4 = qs64_80_u32 & 0x0F0F0F0F; + uint32_t qs64_80_u32_hi4 = (qs64_80_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh = pack32(u16vec2(data_a_packed16[ib0 + i].qh[l0 / 2], data_a_packed16[ib0 + i].qh[l0 / 2 + 8])); + + uint32_t qs0_16_lo4_offset16 = ((qh >> (2*v_im)) & 0x01010101) << 4; + uint32_t qs0_16_hi4_offset16 = ((qh >> (2*v_im)) & 0x02020202) << 3; + uint32_t qs64_80_lo4_offset16 = ((qh >> (2*v_im)) & 0x10101010) << 0; + uint32_t qs64_80_hi4_offset16 = ((qh >> (2*v_im)) & 0x20202020) >> 1; + + qs0_16_u32_lo4 += qs0_16_lo4_offset16; + qs0_16_u32_hi4 += qs0_16_hi4_offset16; + qs64_80_u32_lo4 += qs64_80_lo4_offset16; + qs64_80_u32_hi4 += qs64_80_hi4_offset16; + + uvec4 qs0_16_lo4 = uvec4(unpack8(qs0_16_u32_lo4)); + uvec4 qs64_80_lo4 = uvec4(unpack8(qs64_80_u32_lo4)); + uvec4 qs0_16_hi4 = uvec4(unpack8(qs0_16_u32_hi4)); + uvec4 qs64_80_hi4 = uvec4(unpack8(qs64_80_u32_hi4)); + + const uint32_t q4_0 = qs0_16_lo4.x; + const uint32_t q4_1 = qs0_16_lo4.y; + const uint32_t q4_2 = qs0_16_lo4.z; + const uint32_t q4_3 = qs0_16_lo4.w; + const uint32_t q4_4 = qs0_16_hi4.x; + const uint32_t q4_5 = qs0_16_hi4.y; + const uint32_t q4_6 = qs0_16_hi4.z; + const uint32_t q4_7 = qs0_16_hi4.w; + const uint32_t q4_8 = qs64_80_lo4.x; + const uint32_t q4_9 = qs64_80_lo4.y; + const uint32_t q4_10 = qs64_80_lo4.z; + const uint32_t q4_11 = qs64_80_lo4.w; + const uint32_t q4_12 = qs64_80_hi4.x; + const uint32_t q4_13 = qs64_80_hi4.y; + const uint32_t q4_14 = qs64_80_hi4.z; + const uint32_t q4_15 = qs64_80_hi4.w; + + const FLOAT_TYPE sx = + fma(FLOAT_TYPE(by10.x), q4_0, + fma(FLOAT_TYPE(by10.y), q4_1, + fma(FLOAT_TYPE(by116.x), q4_2, + FLOAT_TYPE(by116.y) * q4_3))); + const FLOAT_TYPE sy = + fma(FLOAT_TYPE(by132.x), q4_4, + fma(FLOAT_TYPE(by132.y), q4_5, + fma(FLOAT_TYPE(by148.x), q4_6, + FLOAT_TYPE(by148.y) * q4_7))); + const FLOAT_TYPE sz = + fma(FLOAT_TYPE(by20.x), q4_8, + fma(FLOAT_TYPE(by20.y), q4_9, + fma(FLOAT_TYPE(by216.x), q4_10, + FLOAT_TYPE(by216.y) * q4_11))); + const FLOAT_TYPE sw = + fma(FLOAT_TYPE(by232.x), q4_12, + fma(FLOAT_TYPE(by232.y), q4_13, + fma(FLOAT_TYPE(by248.x), q4_14, + FLOAT_TYPE(by248.y) * q4_15))); + const FLOAT_TYPE smin = + fma(FLOAT_TYPE(by10.x) + FLOAT_TYPE(by10.y) + FLOAT_TYPE(by116.x) + FLOAT_TYPE(by116.y), sc2, + fma(FLOAT_TYPE(by132.x) + FLOAT_TYPE(by132.y) + FLOAT_TYPE(by148.x) + FLOAT_TYPE(by148.y), sc3, + fma(FLOAT_TYPE(by20.x) + FLOAT_TYPE(by20.y) + FLOAT_TYPE(by216.x) + FLOAT_TYPE(by216.y), sc6, + (FLOAT_TYPE(by232.x) + FLOAT_TYPE(by232.y) + FLOAT_TYPE(by248.x) + FLOAT_TYPE(by248.y)) * sc7))); + temp[n] = fma(dall, fma(sx, sc0, fma(sy, sc1, fma(sz, sc4, sw * sc5))), fma(-dmin, smin, temp[n])); + } } - tmp[gl_LocalInvocationID.x] = temp; - // sum up partial sums and write back result + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } barrier(); - [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { if (tid < s) { - tmp[tid] += tmp[tid + s]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } } barrier(); } if (tid == 0) { - data_d[d_offset + row] = D_TYPE(tmp[0]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + // do NUM_ROWS at a time, unless there aren't enough remaining rows + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp index 760aff854..fab4ff5ff 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp @@ -7,21 +7,15 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout (constant_id = 0) const uint BLOCK_SIZE = 32; +layout (constant_id = 1) const uint NUM_ROWS = 1; -shared FLOAT_TYPE tmp[BLOCK_SIZE]; - -void main() { - const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z; - - if (row >= p.stride_d) { - return; - } +shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE]; +void compute_outputs(const uint32_t first_row, const uint32_t num_rows) { uint a_offset, b_offset, d_offset; get_offsets(a_offset, b_offset, d_offset); const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; // 16 threads are used to process each block const uint it_size = gl_WorkGroupSize.x/16; @@ -42,69 +36,95 @@ void main() { const uint s_offset = 8*v_im + is; const uint y_offset = 128*v_im + l0; - FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp + FLOAT_TYPE temp[NUM_ROWS]; + + [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) { + temp[i] = FLOAT_TYPE(0); + } [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) { - const uint y_idx = i * QUANT_K + y_offset; - - const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); - - FLOAT_TYPE scales[4]; - scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); - scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); - scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); - scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); - - uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); - uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); - - uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; - uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; - uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; - uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; - - uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); - uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; - uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; - uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; - uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; - - uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; - uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; - uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; - uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; - - uvec4 q0 = uvec4(unpack8(q0_u32)); - uvec4 q1 = uvec4(unpack8(q1_u32)); - uvec4 q2 = uvec4(unpack8(q2_u32)); - uvec4 q3 = uvec4(unpack8(q3_u32)); + const uint y_idx = i * QUANT_K + y_offset; B_TYPE_VEC4 by0 = data_b_v4[(b_offset + y_idx) / 4]; B_TYPE_VEC4 by32 = data_b_v4[(b_offset + y_idx) / 4 + 8]; B_TYPE_VEC4 by64 = data_b_v4[(b_offset + y_idx) / 4 + 16]; B_TYPE_VEC4 by96 = data_b_v4[(b_offset + y_idx) / 4 + 24]; - FLOAT_TYPE sum = FLOAT_TYPE(0.0); - [[unroll]] for (int l = 0; l < 4; ++l) { - sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), - fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), - fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), - fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row; + const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); + + FLOAT_TYPE scales[4]; + scales[0] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]); + scales[1] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]); + scales[2] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]); + scales[3] = FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]); + + uint32_t ql0_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 1]) << 16); + uint32_t ql32_u32 = uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 16]) | (uint32_t(data_a_packed16[ib0 + i].ql[ql_offset / 2 + 17]) << 16); + + uint32_t ql0_u32_lo4 = ql0_u32 & 0x0F0F0F0F; + uint32_t ql0_u32_hi4 = (ql0_u32 >> 4) & 0x0F0F0F0F; + uint32_t ql32_u32_lo4 = ql32_u32 & 0x0F0F0F0F; + uint32_t ql32_u32_hi4 = (ql32_u32 >> 4) & 0x0F0F0F0F; + + uint32_t qh_u32 = uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2]) | (uint32_t(data_a_packed16[ib0 + i].qh[qh_offset / 2 + 1]) << 16); + uint32_t qh0_u32 = (qh_u32 & 0x03030303) << 4; + uint32_t qh2_u32 = (qh_u32 & 0x0C0C0C0C) << 2; + uint32_t qh4_u32 = (qh_u32 & 0x30303030) << 0; + uint32_t qh6_u32 = (qh_u32 & 0xC0C0C0C0) >> 2; + + uint32_t q0_u32 = ql0_u32_lo4 | qh0_u32; + uint32_t q1_u32 = ql32_u32_lo4 | qh2_u32; + uint32_t q2_u32 = ql0_u32_hi4 | qh4_u32; + uint32_t q3_u32 = ql32_u32_hi4 | qh6_u32; + + uvec4 q0 = uvec4(unpack8(q0_u32)); + uvec4 q1 = uvec4(unpack8(q1_u32)); + uvec4 q2 = uvec4(unpack8(q2_u32)); + uvec4 q3 = uvec4(unpack8(q3_u32)); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0); + [[unroll]] for (int l = 0; l < 4; ++l) { + sum = fma(FLOAT_TYPE(by0[l]) * scales[0], FLOAT_TYPE(int8_t(q0[l]) - 32), + fma(FLOAT_TYPE(by32[l]) * scales[1], FLOAT_TYPE(int8_t(q1[l]) - 32), + fma(FLOAT_TYPE(by64[l]) * scales[2], FLOAT_TYPE(int8_t(q2[l]) - 32), + fma(FLOAT_TYPE(by96[l]) * scales[3], FLOAT_TYPE(int8_t(q3[l]) - 32), sum)))); + } + temp[n] += sum * d; } - temp += sum * d; } - tmp[gl_LocalInvocationID.x] = temp; // sum up partial sums and write back result - + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] = temp[n]; + } barrier(); - [[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) { + [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) { if (tid < s) { - tmp[tid] += tmp[tid + s]; + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + tmpsh[n][tid] += tmpsh[n][tid + s]; + } } barrier(); } if (tid == 0) { - data_d[d_offset + row] = D_TYPE(tmp[0]); + [[unroll]] for (uint n = 0; n < num_rows; ++n) { + data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]); + } + } +} + +void main() { + const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z); + + // do NUM_ROWS at a time, unless there aren't enough remaining rows + if (first_row + NUM_ROWS <= p.stride_d) { + compute_outputs(first_row, NUM_ROWS); + } else { + if (first_row >= p.stride_d) { + return; + } + compute_outputs(first_row, p.stride_d - first_row); } } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp index 6e20b6411..a25808e16 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/soft_max.comp @@ -32,7 +32,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE]; void soft_max(uint num_iters) { const uint tid = gl_LocalInvocationID.x; const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; - const uint rowy = rowx % p.KY; + const uint rowy = (p.KY > 0) ? (rowx % p.KY) : 0; if (rowx >= p.nrows_x) { return; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index c48a228ae..8111c0638 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -78,7 +78,8 @@ void execute_command(const std::string& command, std::string& stdout_str, std::s } PROCESS_INFORMATION pi; - STARTUPINFOA si = { sizeof(STARTUPINFOA) }; + STARTUPINFOA si = {}; + si.cb = sizeof(STARTUPINFOA); si.dwFlags = STARTF_USESTDHANDLES; si.hStdOutput = stdout_write; si.hStdError = stderr_write; @@ -479,6 +480,8 @@ void process_shaders() { string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("rwkv_wkv6_f32", "wkv6.comp", merge_maps(base_dict, {{"A_TYPE", "float"}})); + for (auto &c : compiles) { c.wait(); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp new file mode 100644 index 000000000..35cc6c45f --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/wkv6.comp @@ -0,0 +1,87 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : require + +#define BLOCK_SIZE 64 +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform Parameters { + uint B; + uint T; + uint C; + uint H; +}; + +layout(binding = 0) readonly buffer KBuf { A_TYPE k[]; }; +layout(binding = 1) readonly buffer VBuf { A_TYPE v[]; }; +layout(binding = 2) readonly buffer RBuf { A_TYPE r[]; }; +layout(binding = 3) readonly buffer TimeFBuf { A_TYPE tf[]; }; +layout(binding = 4) readonly buffer TimeDBuf { A_TYPE td[]; }; +layout(binding = 5) readonly buffer StateBuf { A_TYPE state_in[]; }; +layout(binding = 6) buffer DstBuf { A_TYPE dst[]; }; + +shared A_TYPE _k[BLOCK_SIZE], _r[BLOCK_SIZE], _tf[BLOCK_SIZE], _td[BLOCK_SIZE]; + +void main() { + const uint head_size = BLOCK_SIZE; + const uint batch_id = gl_WorkGroupID.x / H; + const uint head_id = gl_WorkGroupID.x % H; + const uint tid = gl_LocalInvocationID.x; + + const uint state_size = C * head_size; + const uint n_seq_tokens = T / B; + + if (batch_id >= B || head_id >= H) { + return; + } + + A_TYPE state[BLOCK_SIZE]; + [[unroll]] for (uint i = 0; i < head_size; i++) { + state[i] = state_in[batch_id * state_size + head_id * head_size * head_size + + i * head_size + tid]; + } + + barrier(); + _tf[tid] = tf[head_id * head_size + tid]; + barrier(); + + const uint start_t = batch_id * n_seq_tokens * C + head_id * head_size + tid; + const uint end_t = (batch_id + 1) * n_seq_tokens * C + head_id * head_size + tid; + + for (uint t = start_t; t < end_t; t += C) { + barrier(); + _k[tid] = k[t]; + _r[tid] = r[t]; + _td[tid] = td[t]; + barrier(); + + const A_TYPE v_val = v[t]; + A_TYPE y = 0.0; + + [[unroll]] for (uint j = 0; j < head_size; j += 4) { + vec4 k_vec = vec4(_k[j], _k[j+1], _k[j+2], _k[j+3]); + vec4 r_vec = vec4(_r[j], _r[j+1], _r[j+2], _r[j+3]); + vec4 tf_vec = vec4(_tf[j], _tf[j+1], _tf[j+2], _tf[j+3]); + vec4 td_vec = vec4(_td[j], _td[j+1], _td[j+2], _td[j+3]); + vec4 s_vec = vec4(state[j], state[j+1], state[j+2], state[j+3]); + + vec4 kv = k_vec * v_val; + + vec4 temp = tf_vec * kv + s_vec; + y += dot(r_vec, temp); + + s_vec = s_vec * td_vec + kv; + state[j] = s_vec.x; + state[j+1] = s_vec.y; + state[j+2] = s_vec.z; + state[j+3] = s_vec.w; + } + + dst[t] = y; + } + + [[unroll]] for (uint i = 0; i < head_size; i++) { + dst[T * C + batch_id * state_size + head_id * head_size * head_size + + i * head_size + tid] = state[i]; + } +} diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 51cc85662..2bbe5f482 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -3760,104 +3760,10 @@ struct ggml_tensor * ggml_clamp( return result; } -// ggml_conv_1d - static int64_t ggml_calc_conv_output_size(int64_t ins, int64_t ks, int s, int p, int d) { return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; } -GGML_API struct ggml_tensor * ggml_conv_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] - - struct ggml_tensor * result = - ggml_mul_mat(ctx, - ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] - ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] - - result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] - - return result; -} - -// ggml_conv_1d_ph - -struct ggml_tensor* ggml_conv_1d_ph( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s, - int d) { - return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); -} - -// ggml_conv_transpose_1d - -static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { - return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; -} - -GGML_API struct ggml_tensor * ggml_conv_transpose_1d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int p0, - int d0) { - GGML_ASSERT(ggml_is_matrix(b)); - GGML_ASSERT(a->ne[2] == b->ne[1]); - GGML_ASSERT(a->ne[3] == 1); - - GGML_ASSERT(p0 == 0); - GGML_ASSERT(d0 == 1); - - const int64_t ne[4] = { - ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), - a->ne[1], b->ne[2], 1, - }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - - int32_t params[] = { s0, p0, d0 }; - ggml_set_op_params(result, params, sizeof(params)); - - result->op = GGML_OP_CONV_TRANSPOSE_1D; - result->src[0] = a; - result->src[1] = b; - - return result; -} - -// ggml_conv_depthwise - -struct ggml_tensor * ggml_conv_depthwise_2d( - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int s0, - int s1, - int p0, - int p1, - int d0, - int d1) { - struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); - struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, - ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] - struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] - - new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] - struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); - result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] - - return result; -} -// ggml_conv_2d - // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] @@ -3874,10 +3780,11 @@ struct ggml_tensor * ggml_im2col( int d1, bool is_2D, enum ggml_type dst_type) { - if(is_2D) { + if (is_2D) { GGML_ASSERT(a->ne[2] == b->ne[2]); } else { - GGML_ASSERT(a->ne[1] == b->ne[1]); + //GGML_ASSERT(b->ne[1] % a->ne[1] == 0); + GGML_ASSERT(b->ne[1] == a->ne[1]); GGML_ASSERT(b->ne[3] == 1); } @@ -3928,6 +3835,108 @@ struct ggml_tensor * ggml_im2col_back( return result; } +// ggml_conv_1d + +struct ggml_tensor * ggml_conv_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] + + struct ggml_tensor * result = + ggml_mul_mat(ctx, + ggml_reshape_2d(ctx, im2col, im2col->ne[0], (im2col->ne[2] * im2col->ne[1])), // [N, OL, IC * K] => [N*OL, IC * K] + ggml_reshape_2d(ctx, a, (a->ne[0] * a->ne[1]), a->ne[2])); // [OC,IC, K] => [OC, IC * K] + + result = ggml_reshape_3d(ctx, result, im2col->ne[1], a->ne[2], im2col->ne[2]); // [N, OC, OL] + + return result; +} + +// ggml_conv_1d_ph + +struct ggml_tensor* ggml_conv_1d_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s, + int d) { + return ggml_conv_1d(ctx, a, b, s, a->ne[0] / 2, d); +} + +// ggml_conv_1d_dw + +struct ggml_tensor * ggml_conv_1d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], 1, a->ne[1], a->ne[2]); + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]); + + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); + + struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a); + + result = ggml_reshape_3d(ctx, result, b->ne[0], b->ne[1], 1); + + return result; +} + +// ggml_conv_1d_dw_ph + +struct ggml_tensor * ggml_conv_1d_dw_ph( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int d0) { + return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); +} + +// ggml_conv_transpose_1d + +static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { + return (ins - 1) * s - 2 * p + d * (ks - 1) + 1; +} + +GGML_API struct ggml_tensor * ggml_conv_transpose_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0) { + GGML_ASSERT(ggml_is_matrix(b)); + GGML_ASSERT(a->ne[2] == b->ne[1]); + GGML_ASSERT(a->ne[3] == 1); + + GGML_ASSERT(p0 == 0); + GGML_ASSERT(d0 == 1); + + const int64_t ne[4] = { + ggml_calc_conv_transpose_1d_output_size(b->ne[0], a->ne[0], s0, 0 /*p0*/, 1 /*d0*/), + a->ne[1], b->ne[2], 1, + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_TRANSPOSE_1D; + result->src[0] = a; + result->src[1] = b; + + return result; +} + +// ggml_conv_2d + // a: [OC,IC, KH, KW] // b: [N, IC, IH, IW] // result: [N, OC, OH, OW] @@ -3973,6 +3982,31 @@ struct ggml_tensor * ggml_conv_2d_s1_ph( return ggml_conv_2d(ctx, a, b, 1, 1, a->ne[0] / 2, a->ne[1] / 2, 1, 1); } +// ggml_conv_2d_dw + +struct ggml_tensor * ggml_conv_2d_dw( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); + struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, + ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), + s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] + + new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] + struct ggml_tensor * result = ggml_mul_mat(ctx, new_a, new_b); + result = ggml_reshape_4d(ctx, result, im2col->ne[1], im2col->ne[2], b->ne[2], b->ne[3]); // [N, OC, OH, OW] + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { @@ -6037,12 +6071,12 @@ struct ggml_tensor * ggml_graph_get_tensor(const struct ggml_cgraph * cgraph, co struct ggml_tensor * ggml_graph_get_grad(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); - return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grads[igrad] : NULL; + return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grads ? cgraph->grads[igrad] : NULL; } struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) { const size_t igrad = ggml_hash_find(&cgraph->visited_hash_set, node); - return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) ? cgraph->grad_accs[igrad] : NULL; + return igrad != GGML_HASHSET_FULL && ggml_bitset_get(cgraph->visited_hash_set.used, igrad) && cgraph->grad_accs ? cgraph->grad_accs[igrad] : NULL; } void ggml_graph_print(const struct ggml_cgraph * cgraph) { @@ -6489,7 +6523,7 @@ struct gguf_context { void * data; }; -static size_t gguf_type_size(enum gguf_type type) { +size_t gguf_type_size(enum gguf_type type) { GGML_ASSERT(0 <= type && type < GGUF_TYPE_COUNT); return GGUF_TYPE_SIZE[type]; } @@ -6617,13 +6651,7 @@ struct gguf_context * gguf_init_empty(void) { return ctx; } -struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { - FILE * file = ggml_fopen(fname, "rb"); - if (!file) { - fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno)); - return NULL; - } - +struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params) { // offset from start of file size_t offset = 0; @@ -6636,7 +6664,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p for (uint32_t i = 0; i < sizeof(magic); i++) { if (magic[i] != GGUF_MAGIC[i]) { fprintf(stderr, "%s: invalid magic characters '%c%c%c%c'\n", __func__, magic[0], magic[1], magic[2], magic[3]); - fclose(file); return NULL; } } @@ -6647,7 +6674,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p struct gguf_context * ctx = calloc(1, sizeof(struct gguf_context)); if (!ctx) { fprintf(stderr, "%s: failed to allocate memory for context\n", __func__); - fclose(file); return NULL; } @@ -6665,7 +6691,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (ctx->header.version == 1) { fprintf(stderr, "%s: GGUFv1 is no longer supported. please use a more up-to-date version\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6678,7 +6703,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read header\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6688,12 +6712,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p { const uint64_t n_kv = ctx->header.n_kv; - ctx->kv = calloc(n_kv, sizeof(struct gguf_kv)); - if (!ctx->kv) { - fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__); - fclose(file); - gguf_free(ctx); - return NULL; + if (n_kv > 0) { + ctx->kv = calloc(n_kv, sizeof(struct gguf_kv)); + if (!ctx->kv) { + fprintf(stderr, "%s: failed to allocate memory for kv pairs\n", __func__); + gguf_free(ctx); + return NULL; + } } for (uint64_t i = 0; i < n_kv; ++i) { @@ -6740,7 +6765,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // prevent from integer overflow in the malloc below if (kv->value.arr.n >= SIZE_MAX/gguf_type_size(kv->value.arr.type)) { fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - fclose(file); gguf_free(ctx); return NULL; } @@ -6748,7 +6772,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p kv->value.arr.data = calloc(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); if (!kv->value.arr.data) { fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6760,7 +6783,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // prevent from integer overflow in the malloc below if (kv->value.arr.n >= SIZE_MAX/sizeof(struct gguf_str)) { fprintf(stderr, "%s: array size is too large (%" PRIu64 ")\n", __func__, kv->value.arr.n); - fclose(file); gguf_free(ctx); return NULL; } @@ -6768,7 +6790,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p kv->value.arr.data = calloc(kv->value.arr.n, sizeof(struct gguf_str)); if (!kv->value.arr.data) { fprintf(stderr, "%s: failed to allocate memory for array\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6799,7 +6820,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read key-value pairs\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6810,7 +6830,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ctx->infos = calloc(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); if (!ctx->infos) { fprintf(stderr, "%s: failed to allocate memory for tensor infos\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6846,7 +6865,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read tensor info\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6889,7 +6907,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // this tensor type support have been removed: fprintf(stderr, "%s: tensor '%s' of type %d: %s\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type)); - fclose(file); gguf_free(ctx); return NULL; } @@ -6897,7 +6914,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (ne % ggml_blck_size(info->type) != 0) { fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n", __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); - fclose(file); gguf_free(ctx); return NULL; } @@ -6929,7 +6945,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p *params.ctx = ggml_init(pdata); if (*params.ctx == NULL) { fprintf(stderr, "%s: failed to initialize context\n", __func__); - fclose(file); gguf_free(ctx); return NULL; } @@ -6948,7 +6963,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read tensor data\n", __func__); - fclose(file); ggml_free(ctx_data); gguf_free(ctx); return NULL; @@ -6987,7 +7001,6 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { fprintf(stderr, "%s: failed to read the tensor data\n", __func__); - fclose(file); ggml_free(ctx_data); gguf_free(ctx); return NULL; @@ -6996,11 +7009,21 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ggml_set_no_alloc(ctx_data, params.no_alloc); } - fclose(file); - return ctx; } +struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { + FILE * file = ggml_fopen(fname, "rb"); + if (!file) { + fprintf(stderr, "%s: failed to open '%s': '%s'\n", __func__, fname, strerror(errno)); + return NULL; + } + + struct gguf_context * result = gguf_init_from_file_impl(file, params); + fclose(file); + return result; +} + void gguf_free(struct gguf_context * ctx) { if (ctx == NULL) { return; @@ -7460,13 +7483,7 @@ void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const vo // fwrite(val, sizeof(char), size, file); //} -struct gguf_buf { - void * data; - size_t size; - size_t offset; -}; - -static struct gguf_buf gguf_buf_init(size_t size) { +struct gguf_buf gguf_buf_init(size_t size) { struct gguf_buf buf = { /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size), /*buf.size =*/ size, @@ -7476,7 +7493,7 @@ static struct gguf_buf gguf_buf_init(size_t size) { return buf; } -static void gguf_buf_free(struct gguf_buf buf) { +void gguf_buf_free(struct gguf_buf buf) { if (buf.data) { GGML_FREE(buf.data); } @@ -7514,7 +7531,7 @@ static void gguf_bwrite_el(struct gguf_buf * buf, const void * val, size_t el_si buf->offset += el_size; } -static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { +void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * buf, bool only_meta) { // write header gguf_bwrite_el(buf, &ctx->header.magic, sizeof(ctx->header.magic)); gguf_bwrite_el(buf, &ctx->header.version, sizeof(ctx->header.version)); diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0cf939429..273370370 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -90,6 +90,7 @@ class Keys: VOCAB_SIZE = "{arch}.vocab_size" CONTEXT_LENGTH = "{arch}.context_length" EMBEDDING_LENGTH = "{arch}.embedding_length" + FEATURES_LENGTH = "{arch}.features_length" BLOCK_COUNT = "{arch}.block_count" LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count" FEED_FORWARD_LENGTH = "{arch}.feed_forward_length" @@ -122,6 +123,8 @@ class Keys: VALUE_LENGTH = "{arch}.attention.value_length" LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon" LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon" + GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon" + GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups" CAUSAL = "{arch}.attention.causal" Q_LORA_RANK = "{arch}.attention.q_lora_rank" KV_LORA_RANK = "{arch}.attention.kv_lora_rank" @@ -155,6 +158,14 @@ class Keys: class WKV: HEAD_SIZE = "{arch}.wkv.head_size" + class PosNet: + EMBEDDING_LENGTH = "{arch}.posnet.embedding_length" + BLOCK_COUNT = "{arch}.posnet.block_count" + + class ConvNext: + EMBEDDING_LENGTH = "{arch}.convnext.embedding_length" + BLOCK_COUNT = "{arch}.convnext.block_count" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -209,57 +220,60 @@ class GGUFType: class MODEL_ARCH(IntEnum): - LLAMA = auto() - FALCON = auto() - BAICHUAN = auto() - GROK = auto() - GPT2 = auto() - GPTJ = auto() - GPTNEOX = auto() - MPT = auto() - STARCODER = auto() - REFACT = auto() - BERT = auto() - NOMIC_BERT = auto() - JINA_BERT_V2 = auto() - BLOOM = auto() - STABLELM = auto() - QWEN = auto() - QWEN2 = auto() - QWEN2MOE = auto() - QWEN2VL = auto() - PHI2 = auto() - PHI3 = auto() - PLAMO = auto() - CODESHELL = auto() - ORION = auto() - INTERNLM2 = auto() - MINICPM = auto() - MINICPM3 = auto() - GEMMA = auto() - GEMMA2 = auto() - STARCODER2 = auto() - RWKV6 = auto() - MAMBA = auto() - XVERSE = auto() - COMMAND_R = auto() - DBRX = auto() - OLMO = auto() - OLMO2 = auto() - OLMOE = auto() - OPENELM = auto() - ARCTIC = auto() - DEEPSEEK2 = auto() - CHATGLM = auto() - BITNET = auto() - T5 = auto() - T5ENCODER = auto() - JAIS = auto() - NEMOTRON = auto() - EXAONE = auto() - GRANITE = auto() - GRANITE_MOE = auto() - CHAMELEON = auto() + LLAMA = auto() + DECI = auto() + FALCON = auto() + BAICHUAN = auto() + GROK = auto() + GPT2 = auto() + GPTJ = auto() + GPTNEOX = auto() + MPT = auto() + STARCODER = auto() + REFACT = auto() + BERT = auto() + NOMIC_BERT = auto() + JINA_BERT_V2 = auto() + BLOOM = auto() + STABLELM = auto() + QWEN = auto() + QWEN2 = auto() + QWEN2MOE = auto() + QWEN2VL = auto() + PHI2 = auto() + PHI3 = auto() + PLAMO = auto() + CODESHELL = auto() + ORION = auto() + INTERNLM2 = auto() + MINICPM = auto() + MINICPM3 = auto() + GEMMA = auto() + GEMMA2 = auto() + STARCODER2 = auto() + RWKV6 = auto() + MAMBA = auto() + XVERSE = auto() + COMMAND_R = auto() + DBRX = auto() + OLMO = auto() + OLMO2 = auto() + OLMOE = auto() + OPENELM = auto() + ARCTIC = auto() + DEEPSEEK = auto() + DEEPSEEK2 = auto() + CHATGLM = auto() + BITNET = auto() + T5 = auto() + T5ENCODER = auto() + JAIS = auto() + NEMOTRON = auto() + EXAONE = auto() + GRANITE = auto() + GRANITE_MOE = auto() + CHAMELEON = auto() + WAVTOKENIZER_DEC = auto() class MODEL_TENSOR(IntEnum): @@ -369,60 +383,79 @@ class MODEL_TENSOR(IntEnum): ENC_OUTPUT_NORM = auto() CLS = auto() # classifier CLS_OUT = auto() # classifier output projection + CONV1D = auto() + CONVNEXT_DW = auto() + CONVNEXT_NORM = auto() + CONVNEXT_PW1 = auto() + CONVNEXT_PW2 = auto() + CONVNEXT_GAMMA = auto() + POSNET_CONV1 = auto() + POSNET_CONV2 = auto() + POSNET_NORM = auto() + POSNET_NORM1 = auto() + POSNET_NORM2 = auto() + POSNET_ATTN_NORM = auto() + POSNET_ATTN_Q = auto() + POSNET_ATTN_K = auto() + POSNET_ATTN_V = auto() + POSNET_ATTN_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { - MODEL_ARCH.LLAMA: "llama", - MODEL_ARCH.FALCON: "falcon", - MODEL_ARCH.BAICHUAN: "baichuan", - MODEL_ARCH.GROK: "grok", - MODEL_ARCH.GPT2: "gpt2", - MODEL_ARCH.GPTJ: "gptj", - MODEL_ARCH.GPTNEOX: "gptneox", - MODEL_ARCH.MPT: "mpt", - MODEL_ARCH.STARCODER: "starcoder", - MODEL_ARCH.REFACT: "refact", - MODEL_ARCH.BERT: "bert", - MODEL_ARCH.NOMIC_BERT: "nomic-bert", - MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", - MODEL_ARCH.BLOOM: "bloom", - MODEL_ARCH.STABLELM: "stablelm", - MODEL_ARCH.QWEN: "qwen", - MODEL_ARCH.QWEN2: "qwen2", - MODEL_ARCH.QWEN2MOE: "qwen2moe", - MODEL_ARCH.QWEN2VL: "qwen2vl", - MODEL_ARCH.PHI2: "phi2", - MODEL_ARCH.PHI3: "phi3", - MODEL_ARCH.PLAMO: "plamo", - MODEL_ARCH.CODESHELL: "codeshell", - MODEL_ARCH.ORION: "orion", - MODEL_ARCH.INTERNLM2: "internlm2", - MODEL_ARCH.MINICPM: "minicpm", - MODEL_ARCH.MINICPM3: "minicpm3", - MODEL_ARCH.GEMMA: "gemma", - MODEL_ARCH.GEMMA2: "gemma2", - MODEL_ARCH.STARCODER2: "starcoder2", - MODEL_ARCH.RWKV6: "rwkv6", - MODEL_ARCH.MAMBA: "mamba", - MODEL_ARCH.XVERSE: "xverse", - MODEL_ARCH.COMMAND_R: "command-r", - MODEL_ARCH.DBRX: "dbrx", - MODEL_ARCH.OLMO: "olmo", - MODEL_ARCH.OLMO2: "olmo2", - MODEL_ARCH.OLMOE: "olmoe", - MODEL_ARCH.OPENELM: "openelm", - MODEL_ARCH.ARCTIC: "arctic", - MODEL_ARCH.DEEPSEEK2: "deepseek2", - MODEL_ARCH.CHATGLM: "chatglm", - MODEL_ARCH.BITNET: "bitnet", - MODEL_ARCH.T5: "t5", - MODEL_ARCH.T5ENCODER: "t5encoder", - MODEL_ARCH.JAIS: "jais", - MODEL_ARCH.NEMOTRON: "nemotron", - MODEL_ARCH.EXAONE: "exaone", - MODEL_ARCH.GRANITE: "granite", - MODEL_ARCH.GRANITE_MOE: "granitemoe", - MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.LLAMA: "llama", + MODEL_ARCH.DECI: "deci", + MODEL_ARCH.FALCON: "falcon", + MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", + MODEL_ARCH.GPT2: "gpt2", + MODEL_ARCH.GPTJ: "gptj", + MODEL_ARCH.GPTNEOX: "gptneox", + MODEL_ARCH.MPT: "mpt", + MODEL_ARCH.STARCODER: "starcoder", + MODEL_ARCH.REFACT: "refact", + MODEL_ARCH.BERT: "bert", + MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", + MODEL_ARCH.BLOOM: "bloom", + MODEL_ARCH.STABLELM: "stablelm", + MODEL_ARCH.QWEN: "qwen", + MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", + MODEL_ARCH.QWEN2VL: "qwen2vl", + MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", + MODEL_ARCH.PLAMO: "plamo", + MODEL_ARCH.CODESHELL: "codeshell", + MODEL_ARCH.ORION: "orion", + MODEL_ARCH.INTERNLM2: "internlm2", + MODEL_ARCH.MINICPM: "minicpm", + MODEL_ARCH.MINICPM3: "minicpm3", + MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.GEMMA2: "gemma2", + MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.RWKV6: "rwkv6", + MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", + MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", + MODEL_ARCH.OLMO2: "olmo2", + MODEL_ARCH.OLMOE: "olmoe", + MODEL_ARCH.OPENELM: "openelm", + MODEL_ARCH.ARCTIC: "arctic", + MODEL_ARCH.DEEPSEEK: "deepseek", + MODEL_ARCH.DEEPSEEK2: "deepseek2", + MODEL_ARCH.CHATGLM: "chatglm", + MODEL_ARCH.BITNET: "bitnet", + MODEL_ARCH.T5: "t5", + MODEL_ARCH.T5ENCODER: "t5encoder", + MODEL_ARCH.JAIS: "jais", + MODEL_ARCH.NEMOTRON: "nemotron", + MODEL_ARCH.EXAONE: "exaone", + MODEL_ARCH.GRANITE: "granite", + MODEL_ARCH.GRANITE_MOE: "granitemoe", + MODEL_ARCH.CHAMELEON: "chameleon", + MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -532,6 +565,22 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm", MODEL_TENSOR.CLS: "cls", MODEL_TENSOR.CLS_OUT: "cls.output", + MODEL_TENSOR.CONV1D: "conv1d", + MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw", + MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm", + MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1", + MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2", + MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma", + MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1", + MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2", + MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm", + MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1", + MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2", + MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm", + MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q", + MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", + MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", + MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -555,6 +604,26 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], MODEL_ARCH.GROK: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1158,6 +1227,29 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -1347,6 +1439,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.WAVTOKENIZER_DEC: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.CONV1D, + MODEL_TENSOR.CONVNEXT_DW, + MODEL_TENSOR.CONVNEXT_NORM, + MODEL_TENSOR.CONVNEXT_PW1, + MODEL_TENSOR.CONVNEXT_PW2, + MODEL_TENSOR.CONVNEXT_GAMMA, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.POSNET_CONV1, + MODEL_TENSOR.POSNET_CONV2, + MODEL_TENSOR.POSNET_NORM, + MODEL_TENSOR.POSNET_NORM1, + MODEL_TENSOR.POSNET_NORM2, + MODEL_TENSOR.POSNET_ATTN_NORM, + MODEL_TENSOR.POSNET_ATTN_Q, + MODEL_TENSOR.POSNET_ATTN_K, + MODEL_TENSOR.POSNET_ATTN_V, + MODEL_TENSOR.POSNET_ATTN_OUT, + ], # TODO } @@ -1356,6 +1470,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DECI: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.BAICHUAN: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, @@ -1380,6 +1498,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.DEEPSEEK: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], MODEL_ARCH.DEEPSEEK2: [ MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 65a64e10d..3023b539a 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -631,6 +631,21 @@ class GGUFWriter: def add_embedding_length(self, length: int) -> None: self.add_uint32(Keys.LLM.EMBEDDING_LENGTH.format(arch=self.arch), length) + def add_features_length(self, length: int) -> None: + self.add_uint32(Keys.LLM.FEATURES_LENGTH.format(arch=self.arch), length) + + def add_posnet_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.PosNet.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_posnet_block_count(self, length: int) -> None: + self.add_uint32(Keys.PosNet.BLOCK_COUNT.format(arch=self.arch), length) + + def add_convnext_embedding_length(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.EMBEDDING_LENGTH.format(arch=self.arch), length) + + def add_convnext_block_count(self, length: int) -> None: + self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) @@ -727,6 +742,12 @@ class GGUFWriter: def add_layer_norm_rms_eps(self, value: float) -> None: self.add_float32(Keys.Attention.LAYERNORM_RMS_EPS.format(arch=self.arch), value) + def add_group_norm_eps(self, value: float) -> None: + self.add_float32(Keys.Attention.GROUPNORM_EPS.format(arch=self.arch), value) + + def add_group_norm_groups(self, value: int) -> None: + self.add_uint32(Keys.Attention.GROUPNORM_GROUPS.format(arch=self.arch), value) + def add_causal_attention(self, value: bool) -> None: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f0a7b6478..7009a11d4 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -42,6 +42,7 @@ class TensorNameMap: "emb_ln", # nomic-bert "transformer.norm", # openelm "rwkv.blocks.0.pre_ln", # rwkv + "backbone.norm", # wavtokenizer ), # Position embeddings @@ -60,6 +61,7 @@ class TensorNameMap: "lm_head.linear", # phi2 "output_layer", # chatglm "head", # rwkv + "head.out", # wavtokenizer ), # Output norm @@ -80,6 +82,7 @@ class TensorNameMap: "transformer.norm", # openelm "model.norm", # nemotron "rwkv.ln_out", # rwkv + "backbone.final_layer_norm", # wavtokenizer ), # Rope frequencies @@ -90,6 +93,10 @@ class TensorNameMap: MODEL_TENSOR.ROPE_FACTORS_LONG: (), MODEL_TENSOR.ROPE_FACTORS_SHORT: (), + + MODEL_TENSOR.CONV1D: ( + "backbone.embed", # roberta + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { @@ -191,6 +198,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 + "model.layers.{bid}.self_attn.linear_attn", # deci "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j @@ -306,7 +314,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_UP_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek2 + "model.layers.{bid}.mlp.shared_experts.up_proj", # deepseek deepseek2 ), # AWQ-activation gate @@ -338,7 +346,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_GATE_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek2 + "model.layers.{bid}.mlp.shared_experts.gate_proj", # deepseek deepseek2 ), # Feed-forward down @@ -379,7 +387,7 @@ class TensorNameMap: MODEL_TENSOR.FFN_DOWN_SHEXP: ( "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe - "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek2 + "model.layers.{bid}.mlp.shared_experts.down_proj", # deepseek deepseek2 ), MODEL_TENSOR.ATTN_Q_NORM: ( @@ -681,6 +689,8 @@ class TensorNameMap: "encoder.block.{bid}.layer.1.DenseReluDense.wo", # t5 ), + ############################################################################ + # TODO: these do not belong to block_mappings_cfg - move them to mappings_cfg MODEL_TENSOR.ENC_OUTPUT_NORM: ( "encoder.final_layer_norm", # t5 ), @@ -693,6 +703,67 @@ class TensorNameMap: MODEL_TENSOR.CLS_OUT: ( "classifier.out_proj", # roberta ), + ############################################################################# + + MODEL_TENSOR.CONVNEXT_DW: ( + "backbone.convnext.{bid}.dwconv", # wavtokenizer + ), + + MODEL_TENSOR.CONVNEXT_NORM: ( + "backbone.convnext.{bid}.norm", # wavtokenizer + ), + + MODEL_TENSOR.CONVNEXT_PW1: ( + "backbone.convnext.{bid}.pwconv1", # wavtokenizer + ), + + MODEL_TENSOR.CONVNEXT_PW2: ( + "backbone.convnext.{bid}.pwconv2", # wavtokenizer + ), + + MODEL_TENSOR.CONVNEXT_GAMMA: ( + "backbone.convnext.{bid}.gamma", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_CONV1: ( + "backbone.posnet.{bid}.conv1", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_CONV2: ( + "backbone.posnet.{bid}.conv2", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_NORM1: ( + "backbone.posnet.{bid}.norm1", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_NORM2: ( + "backbone.posnet.{bid}.norm2", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_ATTN_NORM: ( + "backbone.posnet.{bid}.norm", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_ATTN_Q: ( + "backbone.posnet.{bid}.q", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_ATTN_K: ( + "backbone.posnet.{bid}.k", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_ATTN_V: ( + "backbone.posnet.{bid}.v", # wavtokenizer + ), + + MODEL_TENSOR.POSNET_ATTN_OUT: ( + "backbone.posnet.{bid}.proj_out", # wavtokenizer + ), } # architecture-specific block mappings diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 10e94876c..9c3956256 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.11.0" +version = "0.13.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ diff --git a/gguf-py/tests/test_quants.py b/gguf-py/tests/test_quants.py index 762067814..f04d5acce 100755 --- a/gguf-py/tests/test_quants.py +++ b/gguf-py/tests/test_quants.py @@ -136,7 +136,7 @@ def compare_tensors(t1: np.ndarray, t2: np.ndarray, qtype: GGMLQuantizationType) logger.debug(f"Sample bad block ({diff_bits[bad_block_id]} differing bits):\n{t1[bad_block_id]}\nReference:\n{t2[bad_block_id]}") sum_diff_bits = np.sum(diff_bits) - logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits/(x.size * 8):.6f}%)") + logger.debug(f"{sum_diff_bits} bits differ ({100 * sum_diff_bits / (x.size * 8):.6f}%)") return False diff --git a/include/llama.h b/include/llama.h index f28baf6de..d8e63835f 100644 --- a/include/llama.h +++ b/include/llama.h @@ -482,9 +482,6 @@ extern "C" { // Returns the total number of parameters in the model LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model); - // Get a llama model tensor - LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); - // Returns true if the model contains an encoder that requires llama_encode() call LLAMA_API bool llama_model_has_encoder(const struct llama_model * model); @@ -1143,16 +1140,12 @@ extern "C" { LLAMA_API bool llama_sampler_is_grammar_empty(struct llama_sampler * gsmpl); + /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first. LLAMA_API struct llama_sampler * llama_sampler_init_penalties( - int32_t n_vocab, // llama_n_vocab() - llama_token special_eos_id, // llama_token_eos() - llama_token linefeed_id, // llama_token_nl() - int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat, // 1.0 = disabled - float penalty_freq, // 0.0 = disabled - float penalty_present, // 0.0 = disabled - bool penalize_nl, // consider newlines as a repeatable token - bool ignore_eos); // ignore the end-of-sequence token + int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat, // 1.0 = disabled + float penalty_freq, // 0.0 = disabled + float penalty_present); // 0.0 = disabled /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 LLAMA_API struct llama_sampler * llama_sampler_init_dry( diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 143d98729..e40d1cc6d 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -20,11 +20,13 @@ if [ -n "$GGML_CUDA" ]; then cmake_opts="-DGGML_CUDA=ON" fi +dir="build-bench" + function run { - rm -fr build > /dev/null - cmake -B build -S . $cmake_opts > /dev/null - cmake --build build -t llama-bench > /dev/null - build/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite + rm -fr ${dir} > /dev/null + cmake -B ${dir} -S . $cmake_opts > /dev/null + cmake --build ${dir} -t llama-bench > /dev/null + ${dir}/bin/llama-bench -o sql -oe md $bench_args | sqlite3 llama-bench.sqlite } git checkout $1 > /dev/null diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 5069ae638..239c458d8 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -126,6 +126,8 @@ connection = sqlite3.connect(input_file) cursor = connection.cursor() builds = cursor.execute("SELECT DISTINCT build_commit FROM test;").fetchall() +commit_short_len = len(builds[0][0]) + try: repo = git.Repo(".", search_parent_directories=True) except git.InvalidGitRepositoryError: @@ -138,11 +140,11 @@ def find_parent_in_data(commit: git.Commit): seen_hexsha8 = set() while heap: depth, current_commit = heapq.heappop(heap) - current_hexsha8 = commit.hexsha[:8] + current_hexsha8 = commit.hexsha[:commit_short_len] if (current_hexsha8,) in builds: return current_hexsha8 for parent in commit.parents: - parent_hexsha8 = parent.hexsha[:8] + parent_hexsha8 = parent.hexsha[:commit_short_len] if parent_hexsha8 not in seen_hexsha8: seen_hexsha8.add(parent_hexsha8) heapq.heappush(heap, (depth + 1, parent)) @@ -156,9 +158,9 @@ def get_all_parent_hexsha8s(commit: git.Commit): while unvisited: current_commit = unvisited.pop(0) - visited.append(current_commit.hexsha[:8]) + visited.append(current_commit.hexsha[:commit_short_len]) for parent in current_commit.parents: - if parent.hexsha[:8] not in visited: + if parent.hexsha[:commit_short_len] not in visited: unvisited.append(parent) return visited @@ -169,10 +171,10 @@ def get_commit_name(hexsha8): if repo is None: return hexsha8 for h in repo.heads: - if h.commit.hexsha[:8] == hexsha8: + if h.commit.hexsha[:commit_short_len] == hexsha8: return h.name for t in repo.tags: - if t.commit.hexsha[:8] == hexsha8: + if t.commit.hexsha[:commit_short_len] == hexsha8: return t.name return hexsha8 @@ -183,13 +185,13 @@ def get_commit_hexsha8(name): return None for h in repo.heads: if h.name == name: - return h.commit.hexsha[:8] + return h.commit.hexsha[:commit_short_len] for t in repo.tags: if t.name == name: - return t.commit.hexsha[:8] + return t.commit.hexsha[:commit_short_len] for c in repo.iter_commits("--all"): - if c.hexsha[:8] == name[:8]: - return c.hexsha[:8] + if c.hexsha[:commit_short_len] == name[:commit_short_len]: + return c.hexsha[:commit_short_len] return None diff --git a/scripts/hf.sh b/scripts/hf.sh index 85c2c4d9a..b251925fa 100755 --- a/scripts/hf.sh +++ b/scripts/hf.sh @@ -26,7 +26,7 @@ function has_cmd { } if has_cmd wget; then - cmd="wget -q --show-progress -c -O %s/%s %s" + cmd="wget -q -c -O %s/%s %s" elif has_cmd curl; then cmd="curl -C - -f --output-dir %s -o %s -L %s" else diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 47eae44f7..b4ac38bbf 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -74d66b63eaf207a24f3e93bb922aba131cbf2906 +e6d93f40dffe8733d5d72f1d8fa6b3ca27ae899f diff --git a/src/llama-grammar.cpp b/src/llama-grammar.cpp index b554fa694..d66f74a05 100644 --- a/src/llama-grammar.cpp +++ b/src/llama-grammar.cpp @@ -822,15 +822,11 @@ llama_grammar_stacks & llama_grammar_get_stacks(struct llama_grammar * grammar) return grammar->stacks; } -void llama_grammar_accept( - const llama_grammar_rules & rules, - const llama_grammar_stacks & stacks, - const uint32_t chr, - llama_grammar_stacks & stacks_new) { - stacks_new.clear(); - stacks_new.reserve(stacks.size()); +void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr) { + llama_grammar_stacks stacks_new; + stacks_new.reserve(grammar->stacks.size()); - for (const auto & stack : stacks) { + for (const auto & stack : grammar->stacks) { if (stack.empty()) { continue; } @@ -844,9 +840,11 @@ void llama_grammar_accept( if (!llama_grammar_is_end_of_sequence(pos)) { new_stack.push_back(pos); } - llama_grammar_advance_stack(rules, new_stack, stacks_new); + llama_grammar_advance_stack(grammar->rules, new_stack, stacks_new); } } + + grammar->stacks = std::move(stacks_new); } llama_grammar_candidates llama_grammar_reject_candidates_for_stack( @@ -1051,7 +1049,12 @@ void llama_grammar_free_impl(struct llama_grammar * grammar) { } struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar) { - llama_grammar * result = new llama_grammar { grammar.vocab, grammar.rules, grammar.stacks, grammar.partial_utf8, }; + llama_grammar * result = new llama_grammar { + grammar.vocab, + grammar.rules, + grammar.stacks, + grammar.partial_utf8, + }; // redirect elements in stacks to point to new rules for (size_t is = 0; is < result->stacks.size(); is++) { @@ -1059,7 +1062,7 @@ struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & gra for (size_t ir0 = 0; ir0 < grammar.rules.size(); ir0++) { for (size_t ir1 = 0; ir1 < grammar.rules[ir0].size(); ir1++) { if (grammar.stacks[is][ie] == &grammar.rules[ir0][ir1]) { - result->stacks[is][ie] = &result->rules[ir0][ir1]; + result->stacks[is][ie] = &result->rules[ir0][ir1]; } } } @@ -1129,11 +1132,8 @@ void llama_grammar_accept_str(struct llama_grammar & grammar, const std::string const auto decoded = decode_utf8(piece, grammar.partial_utf8); const auto & code_points = decoded.first; - llama_grammar_stacks stacks_new; - for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - llama_grammar_accept(grammar.rules, grammar.stacks, *it, stacks_new); - grammar.stacks = std::move(stacks_new); + llama_grammar_accept(&grammar, *it); } grammar.partial_utf8 = decoded.second; diff --git a/src/llama-grammar.h b/src/llama-grammar.h index 4a55ff5da..f8e281e56 100644 --- a/src/llama-grammar.h +++ b/src/llama-grammar.h @@ -58,6 +58,7 @@ using llama_grammar_rules = std::vector; using llama_grammar_stacks = std::vector; using llama_grammar_candidates = std::vector; +// TODO: remove, needed for tests atm const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar); llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar); @@ -65,11 +66,7 @@ const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar // be positioned at a character range (see `llama_grammar_advance_stack`), and // produces the N possible stacks if the given char is accepted at those // positions -void llama_grammar_accept( - const llama_grammar_rules & rules, - const llama_grammar_stacks & stacks, - uint32_t chr, - llama_grammar_stacks & stacks_new); +void llama_grammar_accept(struct llama_grammar * grammar, uint32_t chr); std::vector llama_grammar_reject_candidates_for_stack( const llama_grammar_rules & rules, diff --git a/src/llama-sampling.cpp b/src/llama-sampling.cpp index f5e4c56d1..b032f8192 100644 --- a/src/llama-sampling.cpp +++ b/src/llama-sampling.cpp @@ -1428,19 +1428,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab // penalties struct llama_sampler_penalties { - const int32_t n_vocab; - const llama_token special_eos_id; - const llama_token linefeed_id; - const int32_t penalty_last_n; const float penalty_repeat; const float penalty_freq; const float penalty_present; - const bool penalize_nl; - const bool ignore_eos; - ring_buffer prev; + + // a frequency map to count token occurrences + std::unordered_map token_count; }; static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) { @@ -1453,76 +1449,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to return; } + ctx->token_count[token]++; + + // if the ring buffer is full, remove the oldest token + if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) { + const auto old = ctx->prev.front(); + + ctx->token_count[old]--; + if (ctx->token_count[old] == 0) { + ctx->token_count.erase(old); + } + } + ctx->prev.push_back(token); + +#if 0 + // sanity check + std::unordered_map tmp; + for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { + tmp[ctx->prev.rat(i)]++; + } + + assert(ctx->token_count == tmp); +#endif } static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) { auto * ctx = (llama_sampler_penalties *) smpl->ctx; - if (ctx->ignore_eos) { - assert(ctx->special_eos_id >= 0); - - // optimistically check if the candidates are not yet sorted/shuffled/truncated - if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) { - cur_p->data[ctx->special_eos_id].logit = -INFINITY; - } else { - // else, search for the special EOS token - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id == ctx->special_eos_id) { - cur_p->data[i].logit = -INFINITY; - break; - } - } - } - } - if ((ctx->penalty_last_n == 0) || (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) { return; } - bool nl_found = false; - size_t nl_idx = 0; - float nl_logit = -INFINITY; - if (!ctx->penalize_nl) { - assert(ctx->linefeed_id >= 0); - - // optimistically check if the candidates are not yet sorted/shuffled/truncated - if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) { - nl_found = true; - nl_idx = ctx->linefeed_id; - nl_logit = cur_p->data[ctx->linefeed_id].logit; - } else { - // else, search for the linefeed token - for (size_t i = 0; i < cur_p->size; ++i) { - if (cur_p->data[i].id == ctx->linefeed_id) { - nl_found = true; - nl_idx = i; - nl_logit = cur_p->data[i].logit; - break; - } - } - } - } - - // Create a frequency map to count occurrences of each token in last_tokens - // TODO: optimize this by maintaining the token count in the sampler context - using llama_token_cnt = std::unordered_map; - llama_token_cnt token_count; - - for (int i = 0; i < std::min(ctx->penalty_last_n, ctx->prev.size()); ++i) { - token_count[ctx->prev.rat(i)]++; - } - // Apply frequency and presence penalties to the cur_p for (size_t i = 0; i < cur_p->size; ++i) { - const auto token_iter = token_count.find(cur_p->data[i].id); - if (token_iter == token_count.end()) { + const auto token_iter = ctx->token_count.find(cur_p->data[i].id); + if (token_iter == ctx->token_count.end()) { continue; } const int count = token_iter->second; + assert(count > 0 && count <= ctx->penalty_last_n); + // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong. // This is common fix for this problem, which is to multiply by the penalty instead of dividing. if (cur_p->data[i].logit <= 0) { @@ -1535,30 +1505,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok } cur_p->sorted = false; - - if (!ctx->penalize_nl && nl_found) { - // restore the logit of the newline token if it was penalized - cur_p->data[nl_idx].logit = nl_logit; - } } static void llama_sampler_penalties_reset(struct llama_sampler * smpl) { auto * ctx = (llama_sampler_penalties *) smpl->ctx; ctx->prev.clear(); + ctx->token_count.clear(); } static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) { const auto * ctx = (const llama_sampler_penalties *) smpl->ctx; auto * result = llama_sampler_init_penalties( - ctx->n_vocab, - ctx->special_eos_id, - ctx->linefeed_id, ctx->penalty_last_n, ctx->penalty_repeat, ctx->penalty_freq, - ctx->penalty_present, - ctx->penalize_nl, - ctx->ignore_eos); + ctx->penalty_present); // copy the state { @@ -1585,38 +1546,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = { }; struct llama_sampler * llama_sampler_init_penalties( - int32_t n_vocab, - llama_token special_eos_id, - llama_token linefeed_id, int32_t penalty_last_n, float penalty_repeat, float penalty_freq, - float penalty_present, - bool penalize_nl, - bool ignore_eos) { - if (linefeed_id == LLAMA_TOKEN_NULL) { - penalize_nl = true; - } - - if (special_eos_id == LLAMA_TOKEN_NULL) { - ignore_eos = false; - } - + float penalty_present) { penalty_last_n = std::max(penalty_last_n, 0); return new llama_sampler { /* .iface = */ &llama_sampler_penalties_i, /* .ctx = */ new llama_sampler_penalties { - /* .n_vocab = */ n_vocab, - /* .special_eos_id = */ special_eos_id, - /* .linefeed_id = */ linefeed_id, /* .penalty_last_n = */ penalty_last_n, /* .penalty_repeat = */ penalty_repeat, /* .penalty_freq = */ penalty_freq, /* .penalty_present = */ penalty_present, - /* .penalize_nl = */ penalize_nl, - /* .ignore_eos = */ ignore_eos, /* .prev = */ ring_buffer(penalty_last_n), + /* .token_count = */ {}, }, }; } @@ -1644,7 +1588,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std if (word.find(str) != std::string::npos) { token_sequences.emplace(token_id, std::vector()); } else { - size_t word_len = word.size(), str_len = str.size(); + size_t word_len = word.size(); + size_t str_len = str.size(); size_t pos = -1; while ((pos = word.find(str[0], pos + 1)) != std::string::npos) { bool match = true; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 8c9aaf5a0..0a477d6dd 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -738,7 +738,7 @@ struct llm_tokenizer_wpm_session { std::vector words(1, ""); for (const uint32_t cpt : cpts_nfd) { - const auto flags = unicode_cpt_flags(cpt); + const auto flags = unicode_cpt_flags_from_cpt(cpt); if (flags.is_whitespace) { if (words.back().size()) { // finish previous word if any @@ -1657,7 +1657,7 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t } llama_token llama_token_bos_impl(const struct llama_vocab & vocab) { - return vocab.special_bos_id; + return vocab.type != LLAMA_VOCAB_TYPE_WPM ? vocab.special_bos_id : vocab.special_cls_id; } llama_token llama_token_eos_impl(const struct llama_vocab & vocab) { @@ -1867,6 +1867,10 @@ int32_t llama_detokenize_impl( int32_t text_len_max, bool remove_special, bool unparse_special) { + if (vocab.type == LLAMA_VOCAB_TYPE_NONE) { + return 0; + } + GGML_ASSERT(vocab.tokenizer && "Tokenizer not initialized. Call llama_vocab::init_tokenizer() first."); int32_t avail = text_len_max; diff --git a/src/llama-vocab.h b/src/llama-vocab.h index 4bb16d2e4..a9b0da5ef 100644 --- a/src/llama-vocab.h +++ b/src/llama-vocab.h @@ -45,7 +45,7 @@ struct llama_vocab { id special_unk_id = 0; id special_sep_id = LLAMA_TOKEN_NULL; id special_pad_id = LLAMA_TOKEN_NULL; - id special_cls_id = LLAMA_TOKEN_NULL; + id special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930 id special_mask_id = LLAMA_TOKEN_NULL; id linefeed_id = 13; diff --git a/src/llama.cpp b/src/llama.cpp index 55c59b329..9a829b5b0 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -146,6 +146,7 @@ static std::string format(const char * fmt, ...) { enum llm_arch { LLM_ARCH_LLAMA, + LLM_ARCH_DECI, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, LLM_ARCH_GROK, @@ -184,6 +185,7 @@ enum llm_arch { LLM_ARCH_OLMOE, LLM_ARCH_OPENELM, LLM_ARCH_ARCTIC, + LLM_ARCH_DEEPSEEK, LLM_ARCH_DEEPSEEK2, LLM_ARCH_CHATGLM, LLM_ARCH_BITNET, @@ -196,62 +198,66 @@ enum llm_arch { LLM_ARCH_GRANITE, LLM_ARCH_GRANITE_MOE, LLM_ARCH_CHAMELEON, + LLM_ARCH_WAVTOKENIZER_DEC, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { - { LLM_ARCH_LLAMA, "llama" }, - { LLM_ARCH_FALCON, "falcon" }, - { LLM_ARCH_GROK, "grok" }, - { LLM_ARCH_GPT2, "gpt2" }, - { LLM_ARCH_GPTJ, "gptj" }, - { LLM_ARCH_GPTNEOX, "gptneox" }, - { LLM_ARCH_MPT, "mpt" }, - { LLM_ARCH_BAICHUAN, "baichuan" }, - { LLM_ARCH_STARCODER, "starcoder" }, - { LLM_ARCH_REFACT, "refact" }, - { LLM_ARCH_BERT, "bert" }, - { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, - { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, - { LLM_ARCH_BLOOM, "bloom" }, - { LLM_ARCH_STABLELM, "stablelm" }, - { LLM_ARCH_QWEN, "qwen" }, - { LLM_ARCH_QWEN2, "qwen2" }, - { LLM_ARCH_QWEN2MOE, "qwen2moe" }, - { LLM_ARCH_QWEN2VL, "qwen2vl" }, - { LLM_ARCH_PHI2, "phi2" }, - { LLM_ARCH_PHI3, "phi3" }, - { LLM_ARCH_PLAMO, "plamo" }, - { LLM_ARCH_CODESHELL, "codeshell" }, - { LLM_ARCH_ORION, "orion" }, - { LLM_ARCH_INTERNLM2, "internlm2" }, - { LLM_ARCH_MINICPM, "minicpm" }, - { LLM_ARCH_MINICPM3, "minicpm3" }, - { LLM_ARCH_GEMMA, "gemma" }, - { LLM_ARCH_GEMMA2, "gemma2" }, - { LLM_ARCH_STARCODER2, "starcoder2" }, - { LLM_ARCH_MAMBA, "mamba" }, - { LLM_ARCH_XVERSE, "xverse" }, - { LLM_ARCH_COMMAND_R, "command-r" }, - { LLM_ARCH_DBRX, "dbrx" }, - { LLM_ARCH_OLMO, "olmo" }, - { LLM_ARCH_OLMO2, "olmo2" }, - { LLM_ARCH_OLMOE, "olmoe" }, - { LLM_ARCH_OPENELM, "openelm" }, - { LLM_ARCH_ARCTIC, "arctic" }, - { LLM_ARCH_DEEPSEEK2, "deepseek2" }, - { LLM_ARCH_CHATGLM, "chatglm" }, - { LLM_ARCH_BITNET, "bitnet" }, - { LLM_ARCH_T5, "t5" }, - { LLM_ARCH_T5ENCODER, "t5encoder" }, - { LLM_ARCH_JAIS, "jais" }, - { LLM_ARCH_NEMOTRON, "nemotron" }, - { LLM_ARCH_EXAONE, "exaone" }, - { LLM_ARCH_RWKV6, "rwkv6" }, - { LLM_ARCH_GRANITE, "granite" }, - { LLM_ARCH_GRANITE_MOE, "granitemoe" }, - { LLM_ARCH_CHAMELEON, "chameleon" }, - { LLM_ARCH_UNKNOWN, "(unknown)" }, + { LLM_ARCH_LLAMA, "llama" }, + { LLM_ARCH_DECI, "deci" }, + { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, + { LLM_ARCH_GPT2, "gpt2" }, + { LLM_ARCH_GPTJ, "gptj" }, + { LLM_ARCH_GPTNEOX, "gptneox" }, + { LLM_ARCH_MPT, "mpt" }, + { LLM_ARCH_BAICHUAN, "baichuan" }, + { LLM_ARCH_STARCODER, "starcoder" }, + { LLM_ARCH_REFACT, "refact" }, + { LLM_ARCH_BERT, "bert" }, + { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, + { LLM_ARCH_BLOOM, "bloom" }, + { LLM_ARCH_STABLELM, "stablelm" }, + { LLM_ARCH_QWEN, "qwen" }, + { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_QWEN2MOE, "qwen2moe" }, + { LLM_ARCH_QWEN2VL, "qwen2vl" }, + { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, + { LLM_ARCH_PLAMO, "plamo" }, + { LLM_ARCH_CODESHELL, "codeshell" }, + { LLM_ARCH_ORION, "orion" }, + { LLM_ARCH_INTERNLM2, "internlm2" }, + { LLM_ARCH_MINICPM, "minicpm" }, + { LLM_ARCH_MINICPM3, "minicpm3" }, + { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_GEMMA2, "gemma2" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, + { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, + { LLM_ARCH_OLMO, "olmo" }, + { LLM_ARCH_OLMO2, "olmo2" }, + { LLM_ARCH_OLMOE, "olmoe" }, + { LLM_ARCH_OPENELM, "openelm" }, + { LLM_ARCH_ARCTIC, "arctic" }, + { LLM_ARCH_DEEPSEEK, "deepseek" }, + { LLM_ARCH_DEEPSEEK2, "deepseek2" }, + { LLM_ARCH_CHATGLM, "chatglm" }, + { LLM_ARCH_BITNET, "bitnet" }, + { LLM_ARCH_T5, "t5" }, + { LLM_ARCH_T5ENCODER, "t5encoder" }, + { LLM_ARCH_JAIS, "jais" }, + { LLM_ARCH_NEMOTRON, "nemotron" }, + { LLM_ARCH_EXAONE, "exaone" }, + { LLM_ARCH_RWKV6, "rwkv6" }, + { LLM_ARCH_GRANITE, "granite" }, + { LLM_ARCH_GRANITE_MOE, "granitemoe" }, + { LLM_ARCH_CHAMELEON, "chameleon" }, + { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -271,6 +277,7 @@ enum llm_kv { LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, + LLM_KV_FEATURES_LENGTH, LLM_KV_BLOCK_COUNT, LLM_KV_LEADING_DENSE_BLOCK_COUNT, LLM_KV_FEED_FORWARD_LENGTH, @@ -302,6 +309,8 @@ enum llm_kv { LLM_KV_ATTENTION_VALUE_LENGTH, LLM_KV_ATTENTION_LAYERNORM_EPS, LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, + LLM_KV_ATTENTION_GROUPNORM_EPS, + LLM_KV_ATTENTION_GROUPNORM_GROUPS, LLM_KV_ATTENTION_CAUSAL, LLM_KV_ATTENTION_Q_LORA_RANK, LLM_KV_ATTENTION_KV_LORA_RANK, @@ -365,6 +374,12 @@ enum llm_kv { LLM_KV_ADAPTER_TYPE, LLM_KV_ADAPTER_LORA_ALPHA, + LLM_KV_POSNET_EMBEDDING_LENGTH, + LLM_KV_POSNET_BLOCK_COUNT, + + LLM_KV_CONVNEXT_EMBEDDING_LENGTH, + LLM_KV_CONVNEXT_BLOCK_COUNT, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -388,6 +403,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_VOCAB_SIZE, "%s.vocab_size" }, { LLM_KV_CONTEXT_LENGTH, "%s.context_length" }, { LLM_KV_EMBEDDING_LENGTH, "%s.embedding_length" }, + { LLM_KV_FEATURES_LENGTH, "%s.features_length" }, { LLM_KV_BLOCK_COUNT, "%s.block_count" }, { LLM_KV_LEADING_DENSE_BLOCK_COUNT, "%s.leading_dense_block_count" }, { LLM_KV_FEED_FORWARD_LENGTH, "%s.feed_forward_length" }, @@ -419,6 +435,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_VALUE_LENGTH, "%s.attention.value_length" }, { LLM_KV_ATTENTION_LAYERNORM_EPS, "%s.attention.layer_norm_epsilon" }, { LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, "%s.attention.layer_norm_rms_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_EPS, "%s.attention.group_norm_epsilon" }, + { LLM_KV_ATTENTION_GROUPNORM_GROUPS, "%s.attention.group_norm_groups" }, { LLM_KV_ATTENTION_CAUSAL, "%s.attention.causal" }, { LLM_KV_ATTENTION_Q_LORA_RANK, "%s.attention.q_lora_rank" }, { LLM_KV_ATTENTION_KV_LORA_RANK, "%s.attention.kv_lora_rank" }, @@ -449,6 +467,12 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_WKV_HEAD_SIZE, "%s.wkv.head_size" }, + { LLM_KV_POSNET_EMBEDDING_LENGTH, "%s.posnet.embedding_length" }, + { LLM_KV_POSNET_BLOCK_COUNT, "%s.posnet.block_count" }, + + { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" }, + { LLM_KV_CONVNEXT_BLOCK_COUNT, "%s.convnext.block_count" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -607,6 +631,22 @@ enum llm_tensor { LLM_TENSOR_ENC_OUTPUT_NORM, LLM_TENSOR_CLS, LLM_TENSOR_CLS_OUT, + LLM_TENSOR_CONV1D, + LLM_TENSOR_CONVNEXT_DW, + LLM_TENSOR_CONVNEXT_NORM, + LLM_TENSOR_CONVNEXT_PW1, + LLM_TENSOR_CONVNEXT_PW2, + LLM_TENSOR_CONVNEXT_GAMMA, + LLM_TENSOR_POS_NET_CONV1, + LLM_TENSOR_POS_NET_CONV2, + LLM_TENSOR_POS_NET_NORM, + LLM_TENSOR_POS_NET_NORM1, + LLM_TENSOR_POS_NET_NORM2, + LLM_TENSOR_POS_NET_ATTN_NORM, + LLM_TENSOR_POS_NET_ATTN_Q, + LLM_TENSOR_POS_NET_ATTN_K, + LLM_TENSOR_POS_NET_ATTN_V, + LLM_TENSOR_POS_NET_ATTN_OUT, }; static const std::map> LLM_TENSOR_NAMES = { @@ -636,6 +676,32 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_DECI, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_BAICHUAN, { @@ -1309,6 +1375,33 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, + { + LLM_ARCH_DEEPSEEK, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + }, + }, { LLM_ARCH_DEEPSEEK2, { @@ -1564,6 +1657,31 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, + { + LLM_ARCH_WAVTOKENIZER_DEC, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_CONV1D, "conv1d" }, + { LLM_TENSOR_CONVNEXT_DW, "convnext.%d.dw" }, + { LLM_TENSOR_CONVNEXT_NORM, "convnext.%d.norm" }, + { LLM_TENSOR_CONVNEXT_PW1, "convnext.%d.pw1" }, + { LLM_TENSOR_CONVNEXT_PW2, "convnext.%d.pw2" }, + { LLM_TENSOR_CONVNEXT_GAMMA, "convnext.%d.gamma" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_POS_NET_CONV1, "posnet.%d.conv1" }, + { LLM_TENSOR_POS_NET_CONV2, "posnet.%d.conv2" }, + { LLM_TENSOR_POS_NET_NORM, "posnet.%d.norm" }, + { LLM_TENSOR_POS_NET_NORM1, "posnet.%d.norm1" }, + { LLM_TENSOR_POS_NET_NORM2, "posnet.%d.norm2" }, + { LLM_TENSOR_POS_NET_ATTN_NORM, "posnet.%d.attn_norm" }, + { LLM_TENSOR_POS_NET_ATTN_Q, "posnet.%d.attn_q" }, + { LLM_TENSOR_POS_NET_ATTN_K, "posnet.%d.attn_k" }, + { LLM_TENSOR_POS_NET_ATTN_V, "posnet.%d.attn_v" }, + { LLM_TENSOR_POS_NET_ATTN_OUT, "posnet.%d.attn_output" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1583,6 +1701,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN, LLM_CHAT_TEMPLATE_MISTRAL_V7, LLM_CHAT_TEMPLATE_PHI_3, + LLM_CHAT_TEMPLATE_FALCON_3, LLM_CHAT_TEMPLATE_ZEPHYR, LLM_CHAT_TEMPLATE_MONARCH, LLM_CHAT_TEMPLATE_GEMMA, @@ -1600,6 +1719,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_EXAONE_3, LLM_CHAT_TEMPLATE_RWKV_WORLD, LLM_CHAT_TEMPLATE_GRANITE, + LLM_CHAT_TEMPLATE_GIGACHAT, + LLM_CHAT_TEMPLATE_MEGREZ, LLM_CHAT_TEMPLATE_UNKNOWN, }; @@ -1614,6 +1735,7 @@ static const std::map LLM_CHAT_TEMPLATES = { { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN }, { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 }, { "phi3", LLM_CHAT_TEMPLATE_PHI_3 }, + { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 }, { "zephyr", LLM_CHAT_TEMPLATE_ZEPHYR }, { "monarch", LLM_CHAT_TEMPLATE_MONARCH }, { "gemma", LLM_CHAT_TEMPLATE_GEMMA }, @@ -1631,6 +1753,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD }, { "granite", LLM_CHAT_TEMPLATE_GRANITE }, + { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT }, + { "megrez", LLM_CHAT_TEMPLATE_MEGREZ }, }; static llm_arch llm_arch_from_string(const std::string & name) { @@ -2452,15 +2576,26 @@ static const size_t kiB = 1024; static const size_t MiB = 1024*kiB; static const size_t GiB = 1024*MiB; +struct llama_hparams_posnet { + uint32_t n_embd; + uint32_t n_layer; +}; + +struct llama_hparams_convnext { + uint32_t n_embd; + uint32_t n_layer; +}; + struct llama_hparams { bool vocab_only; bool rope_finetuned; bool use_par_res; bool swin_norm; - uint32_t n_vocab; + uint32_t n_vocab = 0; uint32_t n_ctx_train; // context size the model was trained on uint32_t n_embd; + uint32_t n_embd_features = 0; uint32_t n_layer; uint32_t n_rot; uint32_t n_swa = 0; // sliding window attention (SWA) @@ -2471,6 +2606,10 @@ struct llama_hparams { uint32_t n_vocab_type = 0; // for BERT-style token types uint32_t n_rel_attn_bkts = 0; + // for WavTokenizer + struct llama_hparams_posnet posnet; + struct llama_hparams_convnext convnext; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; @@ -2485,6 +2624,9 @@ struct llama_hparams { float f_norm_eps; float f_norm_rms_eps; + float f_norm_group_eps; + + uint32_t n_norm_groups; float f_attn_logit_softcapping = 50.0f; float f_final_logit_softcapping = 30.0f; @@ -2530,66 +2672,6 @@ struct llama_hparams { enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE; - bool operator!=(const llama_hparams & other) const { - if (this->vocab_only != other.vocab_only) return true; - if (this->n_vocab != other.n_vocab) return true; - if (this->n_ctx_train != other.n_ctx_train) return true; - if (this->n_embd != other.n_embd) return true; - if (this->n_layer != other.n_layer) return true; - if (this->n_rot != other.n_rot) return true; - if (this->n_swa != other.n_swa) return true; - if (this->n_embd_head_k != other.n_embd_head_k) return true; - if (this->n_embd_head_v != other.n_embd_head_v) return true; - if (this->n_expert != other.n_expert) return true; - if (this->n_expert_used != other.n_expert_used) return true; - - if (this->n_head_arr != other.n_head_arr) return true; - if (this->n_head_kv_arr != other.n_head_kv_arr) return true; - if (this->n_ff_arr != other.n_ff_arr) return true; - - if (this->n_rel_attn_bkts != other.n_rel_attn_bkts) return true; - if (this->n_layer_dense_lead != other.n_layer_dense_lead) return true; - if (this->n_lora_q != other.n_lora_q) return true; - if (this->n_lora_kv != other.n_lora_kv) return true; - if (this->n_ff_exp != other.n_ff_exp) return true; - if (this->n_ff_shexp != other.n_ff_shexp) return true; - if (this->n_expert_shared != other.n_expert_shared) return true; - - if (this->rope_finetuned != other.rope_finetuned) return true; - if (this->n_ctx_orig_yarn != other.n_ctx_orig_yarn) return true; - if (std::equal(std::begin(this->rope_sections), - std::end(this->rope_sections), - std::begin(other.rope_sections))) return true; - - if (this->ssm_d_conv != other.ssm_d_conv) return true; - if (this->ssm_d_inner != other.ssm_d_inner) return true; - if (this->ssm_d_state != other.ssm_d_state) return true; - if (this->ssm_dt_rank != other.ssm_dt_rank) return true; - if (this->ssm_dt_b_c_rms != other.ssm_dt_b_c_rms) return true; - - if (this->rescale_every_n_layers != other.rescale_every_n_layers) return true; - if (this->time_mix_extra_dim != other.time_mix_extra_dim) return true; - if (this->time_decay_extra_dim != other.time_decay_extra_dim) return true; - if (this->wkv_head_size != other.wkv_head_size) return true; - - if (this->dec_start_token_id != other.dec_start_token_id) return true; - - const float EPSILON = 1e-9f; - - if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; - if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; - if (!is_float_close(this->rope_attn_factor, other.rope_attn_factor, EPSILON)) return true; - if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; - if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; - if (!is_float_close(this->expert_weights_scale, other.expert_weights_scale, EPSILON)) return true; - if (!is_float_close(this->rope_yarn_log_mul, other.rope_yarn_log_mul, EPSILON)) return true; - if (!is_float_close(this->f_residual_scale, other.f_residual_scale, EPSILON)) return true; - if (!is_float_close(this->f_embedding_scale, other.f_embedding_scale, EPSILON)) return true; - if (!is_float_close(this->f_attention_scale, other.f_attention_scale, EPSILON)) return true; - - return false; - } - uint32_t n_head(uint32_t il = 0) const { if (il < n_layer) { return n_head_arr[il]; @@ -2642,21 +2724,21 @@ struct llama_hparams { if (wkv_head_size != 0) { // for RWKV models return 2 * n_embd; - } else { - // TODO: maybe support other convolution strides than 1 - // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; } + + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; } uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings if (wkv_head_size != 0) { // corresponds to RWKV's wkv_states size return n_embd * wkv_head_size; - } else { - // corresponds to Mamba's ssm_states size - return ssm_d_state * ssm_d_inner; } + + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; } }; @@ -2694,142 +2776,187 @@ struct llama_cparams { void * cb_eval_user_data; }; -// TODO: separate into "llama_layer_enc" and "llama_layer_dec" -struct llama_layer { - llama_layer() { - // initialize all pointers to NULL - std::memset(this, 0, sizeof(*this)); - } +struct llama_layer_posnet { + // resnet + struct ggml_tensor * norm1 = nullptr; + struct ggml_tensor * norm1_b = nullptr; - // normalization - struct ggml_tensor * attn_norm; - struct ggml_tensor * attn_norm_b; - struct ggml_tensor * attn_norm_2; - struct ggml_tensor * attn_norm_2_b; - struct ggml_tensor * attn_q_norm; - struct ggml_tensor * attn_q_norm_b; - struct ggml_tensor * attn_k_norm; - struct ggml_tensor * attn_k_norm_b; - struct ggml_tensor * attn_out_norm; - struct ggml_tensor * attn_out_norm_b; - struct ggml_tensor * attn_q_a_norm; - struct ggml_tensor * attn_kv_a_norm; - struct ggml_tensor * attn_sub_norm; - struct ggml_tensor * attn_post_norm; - struct ggml_tensor * ffn_sub_norm; - struct ggml_tensor * attn_norm_cross; - struct ggml_tensor * attn_norm_enc; + struct ggml_tensor * conv1 = nullptr; + struct ggml_tensor * conv1_b = nullptr; + + struct ggml_tensor * norm2 = nullptr; + struct ggml_tensor * norm2_b = nullptr; + + struct ggml_tensor * conv2 = nullptr; + struct ggml_tensor * conv2_b = nullptr; // attention - struct ggml_tensor * wq; - struct ggml_tensor * wk; - struct ggml_tensor * wv; - struct ggml_tensor * wo; - struct ggml_tensor * wqkv; - struct ggml_tensor * wq_a; - struct ggml_tensor * wq_b; - struct ggml_tensor * wkv_a_mqa; - struct ggml_tensor * wkv_b; - struct ggml_tensor * wq_cross; - struct ggml_tensor * wk_cross; - struct ggml_tensor * wv_cross; - struct ggml_tensor * wo_cross; - struct ggml_tensor * wq_enc; - struct ggml_tensor * wk_enc; - struct ggml_tensor * wv_enc; - struct ggml_tensor * wo_enc; + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + + struct ggml_tensor * attn_q = nullptr; + struct ggml_tensor * attn_q_b = nullptr; + + struct ggml_tensor * attn_k = nullptr; + struct ggml_tensor * attn_k_b = nullptr; + + struct ggml_tensor * attn_v = nullptr; + struct ggml_tensor * attn_v_b = nullptr; + + struct ggml_tensor * attn_o = nullptr; + struct ggml_tensor * attn_o_b = nullptr; + + // normalize + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; +}; + +struct llama_layer_convnext { + struct ggml_tensor * dw = nullptr; + struct ggml_tensor * dw_b = nullptr; + + struct ggml_tensor * norm = nullptr; + struct ggml_tensor * norm_b = nullptr; + + struct ggml_tensor * pw1 = nullptr; + struct ggml_tensor * pw1_b = nullptr; + + struct ggml_tensor * pw2 = nullptr; + struct ggml_tensor * pw2_b = nullptr; + + struct ggml_tensor * gamma = nullptr; +}; + +struct llama_layer { + // normalization + struct ggml_tensor * attn_norm = nullptr; + struct ggml_tensor * attn_norm_b = nullptr; + struct ggml_tensor * attn_norm_2 = nullptr; + struct ggml_tensor * attn_norm_2_b = nullptr; + struct ggml_tensor * attn_q_norm = nullptr; + struct ggml_tensor * attn_q_norm_b = nullptr; + struct ggml_tensor * attn_k_norm = nullptr; + struct ggml_tensor * attn_k_norm_b = nullptr; + struct ggml_tensor * attn_out_norm = nullptr; + struct ggml_tensor * attn_out_norm_b = nullptr; + struct ggml_tensor * attn_q_a_norm = nullptr; + struct ggml_tensor * attn_kv_a_norm = nullptr; + struct ggml_tensor * attn_sub_norm = nullptr; + struct ggml_tensor * attn_post_norm = nullptr; + struct ggml_tensor * ffn_sub_norm = nullptr; + struct ggml_tensor * attn_norm_cross = nullptr; + struct ggml_tensor * attn_norm_enc = nullptr; + + // attention + struct ggml_tensor * wq = nullptr; + struct ggml_tensor * wk = nullptr; + struct ggml_tensor * wv = nullptr; + struct ggml_tensor * wo = nullptr; + struct ggml_tensor * wqkv = nullptr; + struct ggml_tensor * wq_a = nullptr; + struct ggml_tensor * wq_b = nullptr; + struct ggml_tensor * wkv_a_mqa = nullptr; + struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wq_cross = nullptr; + struct ggml_tensor * wk_cross = nullptr; + struct ggml_tensor * wv_cross = nullptr; + struct ggml_tensor * wo_cross = nullptr; + struct ggml_tensor * wq_enc = nullptr; + struct ggml_tensor * wk_enc = nullptr; + struct ggml_tensor * wv_enc = nullptr; + struct ggml_tensor * wo_enc = nullptr; // attention bias - struct ggml_tensor * bq; - struct ggml_tensor * bk; - struct ggml_tensor * bv; - struct ggml_tensor * bo; - struct ggml_tensor * bqkv; + struct ggml_tensor * bq = nullptr; + struct ggml_tensor * bk = nullptr; + struct ggml_tensor * bv = nullptr; + struct ggml_tensor * bo = nullptr; + struct ggml_tensor * bqkv = nullptr; // relative position bias - struct ggml_tensor * attn_rel_b; - struct ggml_tensor * attn_rel_b_enc; - struct ggml_tensor * attn_rel_b_cross; + struct ggml_tensor * attn_rel_b = nullptr; + struct ggml_tensor * attn_rel_b_enc = nullptr; + struct ggml_tensor * attn_rel_b_cross = nullptr; // normalization - struct ggml_tensor * ffn_norm; - struct ggml_tensor * ffn_norm_b; - struct ggml_tensor * ffn_post_norm; - struct ggml_tensor * layer_out_norm; - struct ggml_tensor * layer_out_norm_b; - struct ggml_tensor * ffn_norm_exps; - struct ggml_tensor * ffn_norm_enc; + struct ggml_tensor * ffn_norm = nullptr; + struct ggml_tensor * ffn_norm_b = nullptr; + struct ggml_tensor * ffn_post_norm = nullptr; + struct ggml_tensor * layer_out_norm = nullptr; + struct ggml_tensor * layer_out_norm_b = nullptr; + struct ggml_tensor * ffn_norm_exps = nullptr; + struct ggml_tensor * ffn_norm_enc = nullptr; // ff - struct ggml_tensor * ffn_gate; // w1 - struct ggml_tensor * ffn_down; // w2 - struct ggml_tensor * ffn_up; // w3 - struct ggml_tensor * ffn_gate_enc; - struct ggml_tensor * ffn_down_enc; - struct ggml_tensor * ffn_up_enc; + struct ggml_tensor * ffn_gate = nullptr; // w1 + struct ggml_tensor * ffn_down = nullptr; // w2 + struct ggml_tensor * ffn_up = nullptr; // w3 + struct ggml_tensor * ffn_gate_enc = nullptr; + struct ggml_tensor * ffn_down_enc = nullptr; + struct ggml_tensor * ffn_up_enc = nullptr; // ff MoE - struct ggml_tensor * ffn_gate_inp; - struct ggml_tensor * ffn_gate_exps; - struct ggml_tensor * ffn_down_exps; - struct ggml_tensor * ffn_up_exps ; + struct ggml_tensor * ffn_gate_inp = nullptr; + struct ggml_tensor * ffn_gate_exps = nullptr; + struct ggml_tensor * ffn_down_exps = nullptr; + struct ggml_tensor * ffn_up_exps = nullptr; // ff shared expert (shexp) - struct ggml_tensor * ffn_gate_inp_shexp; - struct ggml_tensor * ffn_gate_shexp; - struct ggml_tensor * ffn_down_shexp; - struct ggml_tensor * ffn_up_shexp; + struct ggml_tensor * ffn_gate_inp_shexp = nullptr; + struct ggml_tensor * ffn_gate_shexp = nullptr; + struct ggml_tensor * ffn_down_shexp = nullptr; + struct ggml_tensor * ffn_up_shexp = nullptr; // ff bias - struct ggml_tensor * ffn_gate_b; - struct ggml_tensor * ffn_down_b; // b2 - struct ggml_tensor * ffn_up_b; // b3 - struct ggml_tensor * ffn_act; + struct ggml_tensor * ffn_gate_b = nullptr; + struct ggml_tensor * ffn_down_b = nullptr; // b2 + struct ggml_tensor * ffn_up_b = nullptr; // b3 + struct ggml_tensor * ffn_act = nullptr; // mamba proj - struct ggml_tensor * ssm_in; - struct ggml_tensor * ssm_x; - struct ggml_tensor * ssm_dt; - struct ggml_tensor * ssm_out; + struct ggml_tensor * ssm_in = nullptr; + struct ggml_tensor * ssm_x = nullptr; + struct ggml_tensor * ssm_dt = nullptr; + struct ggml_tensor * ssm_out = nullptr; // mamba - struct ggml_tensor * ssm_conv1d; - struct ggml_tensor * ssm_a; - struct ggml_tensor * ssm_d; + struct ggml_tensor * ssm_conv1d = nullptr; + struct ggml_tensor * ssm_a = nullptr; + struct ggml_tensor * ssm_d = nullptr; // mamba bias - struct ggml_tensor * ssm_conv1d_b; - struct ggml_tensor * ssm_dt_b; + struct ggml_tensor * ssm_conv1d_b = nullptr; + struct ggml_tensor * ssm_dt_b = nullptr; // rwkv - struct ggml_tensor * time_mix_w1; - struct ggml_tensor * time_mix_w2; - struct ggml_tensor * time_mix_lerp_x; - struct ggml_tensor * time_mix_lerp_w; - struct ggml_tensor * time_mix_lerp_k; - struct ggml_tensor * time_mix_lerp_v; - struct ggml_tensor * time_mix_lerp_r; - struct ggml_tensor * time_mix_lerp_g; + struct ggml_tensor * time_mix_w1 = nullptr; + struct ggml_tensor * time_mix_w2 = nullptr; + struct ggml_tensor * time_mix_lerp_x = nullptr; + struct ggml_tensor * time_mix_lerp_w = nullptr; + struct ggml_tensor * time_mix_lerp_k = nullptr; + struct ggml_tensor * time_mix_lerp_v = nullptr; + struct ggml_tensor * time_mix_lerp_r = nullptr; + struct ggml_tensor * time_mix_lerp_g = nullptr; - struct ggml_tensor * time_mix_first; - struct ggml_tensor * time_mix_decay; - struct ggml_tensor * time_mix_decay_w1; - struct ggml_tensor * time_mix_decay_w2; - struct ggml_tensor * time_mix_key; - struct ggml_tensor * time_mix_value; - struct ggml_tensor * time_mix_receptance; - struct ggml_tensor * time_mix_gate; + struct ggml_tensor * time_mix_first = nullptr; + struct ggml_tensor * time_mix_decay = nullptr; + struct ggml_tensor * time_mix_decay_w1 = nullptr; + struct ggml_tensor * time_mix_decay_w2 = nullptr; + struct ggml_tensor * time_mix_key = nullptr; + struct ggml_tensor * time_mix_value = nullptr; + struct ggml_tensor * time_mix_receptance = nullptr; + struct ggml_tensor * time_mix_gate = nullptr; - struct ggml_tensor * time_mix_ln; - struct ggml_tensor * time_mix_ln_b; - struct ggml_tensor * time_mix_output; + struct ggml_tensor * time_mix_ln = nullptr; + struct ggml_tensor * time_mix_ln_b = nullptr; + struct ggml_tensor * time_mix_output = nullptr; - struct ggml_tensor * channel_mix_lerp_k; - struct ggml_tensor * channel_mix_lerp_r; + struct ggml_tensor * channel_mix_lerp_k = nullptr; + struct ggml_tensor * channel_mix_lerp_r = nullptr; - struct ggml_tensor * channel_mix_key; - struct ggml_tensor * channel_mix_receptance; - struct ggml_tensor * channel_mix_value; + struct ggml_tensor * channel_mix_key = nullptr; + struct ggml_tensor * channel_mix_receptance = nullptr; + struct ggml_tensor * channel_mix_value = nullptr; // long rope factors struct ggml_tensor * rope_long = nullptr; @@ -2837,13 +2964,17 @@ struct llama_layer { struct ggml_tensor * rope_freqs = nullptr; // bitnet scale - struct ggml_tensor * wq_scale; - struct ggml_tensor * wk_scale; - struct ggml_tensor * wv_scale; - struct ggml_tensor * wo_scale; - struct ggml_tensor * ffn_gate_scale; - struct ggml_tensor * ffn_up_scale; - struct ggml_tensor * ffn_down_scale; + struct ggml_tensor * wq_scale = nullptr; + struct ggml_tensor * wk_scale = nullptr; + struct ggml_tensor * wv_scale = nullptr; + struct ggml_tensor * wo_scale = nullptr; + struct ggml_tensor * ffn_gate_scale = nullptr; + struct ggml_tensor * ffn_up_scale = nullptr; + struct ggml_tensor * ffn_down_scale = nullptr; + + struct llama_layer_posnet posnet; + + struct llama_layer_convnext convnext; }; // very similar to llama_batch, @@ -2974,6 +3105,9 @@ struct llama_model { struct ggml_tensor * cls_out = nullptr; struct ggml_tensor * cls_out_b = nullptr; + struct ggml_tensor * conv1d = nullptr; + struct ggml_tensor * conv1d_b = nullptr; + std::vector layers; // gguf metadata @@ -3058,6 +3192,7 @@ struct llama_sbatch { // batch indices of the output std::vector out_ids; std::vector seq; + const llama_batch * batch = nullptr; // buffers for the ubatch @@ -3478,6 +3613,17 @@ static int llama_get_device_count(const llama_model & model) { return (int) model.devices.size(); } +static struct ggml_tensor * llama_get_model_tensor(const struct llama_model * model, const char * name) { + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; +} + template static bool buft_supported(ggml_backend_buffer_type_t buft, ggml_backend_dev_t dev, F & fn) { ggml_init_params params = { @@ -3531,7 +3677,9 @@ static bool llama_kv_cache_init( const struct llama_hparams & hparams = model.hparams; - const int64_t n_layer = hparams.n_layer; + const int32_t n_layer = hparams.n_layer; + + LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer); cache.has_shift = false; @@ -3572,10 +3720,12 @@ static bool llama_kv_cache_init( cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - for (int i = 0; i < (int) n_layer; i++) { + for (int i = 0; i < n_layer; i++) { const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s(); + LLAMA_LOG_DEBUG("%s: layer %d: n_embd_k_gqa = %d, n_embd_v_gqa = %d\n", __func__, i, n_embd_k_gqa, n_embd_v_gqa); + ggml_backend_buffer_type_t buft; if (offload) { auto * dev = model.dev_layer.at(i).dev; @@ -5488,7 +5638,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_GENERAL_NAME, model.name, false); // get hparams kv - ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab); + ml.get_key(LLM_KV_VOCAB_SIZE, hparams.n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, hparams.n_vocab, false); // everything past this point is not vocab-related if (hparams.vocab_only) { @@ -5501,6 +5651,16 @@ static void llm_load_hparams( ml.get_key(LLM_KV_EXPERT_COUNT, hparams.n_expert, false); ml.get_key(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used, false); + if (model.arch == LLM_ARCH_WAVTOKENIZER_DEC) { + ml.get_key(LLM_KV_FEATURES_LENGTH, hparams.n_embd_features); + + ml.get_key(LLM_KV_POSNET_EMBEDDING_LENGTH, hparams.posnet.n_embd); + ml.get_key(LLM_KV_POSNET_BLOCK_COUNT, hparams.posnet.n_layer); + + ml.get_key(LLM_KV_CONVNEXT_EMBEDDING_LENGTH, hparams.convnext.n_embd); + ml.get_key(LLM_KV_CONVNEXT_BLOCK_COUNT, hparams.convnext.n_layer); + } + GGML_ASSERT(hparams.n_expert <= LLAMA_MAX_EXPERTS); GGML_ASSERT(hparams.n_expert_used <= hparams.n_expert); if (hparams.n_expert > 0) { @@ -5509,13 +5669,13 @@ static void llm_load_hparams( GGML_ASSERT(hparams.n_expert_used == 0); } - // zero-out the per-layer hparams + // zero-out the array hparams std::fill(hparams.n_head_arr.begin(), hparams.n_head_arr.end(), 0); std::fill(hparams.n_head_kv_arr.begin(), hparams.n_head_kv_arr.end(), 0); std::fill(hparams.n_ff_arr.begin(), hparams.n_ff_arr.end(), 0); - ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer); - ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer); + ml.get_key_or_arr(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, hparams.n_layer, false); // n_head_kv is optional, default to n_head hparams.n_head_kv_arr = hparams.n_head_arr; @@ -5564,7 +5724,7 @@ static void llm_load_hparams( ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); - if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) { + if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_DECI || model.arch == LLM_ARCH_FALCON) { if (hparams.n_rot != hparams.n_embd_head_k) { throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd_head_k)); } @@ -5604,6 +5764,15 @@ static void llm_load_hparams( } } } break; + case LLM_ARCH_DECI: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 80: model.type = e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_MINICPM: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -6094,6 +6263,19 @@ static void llm_load_hparams( model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_DEEPSEEK: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); + ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); + ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); + + switch (hparams.n_layer) { + case 28: model.type = e_model::MODEL_20B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_DEEPSEEK2: { bool is_lite = (hparams.n_layer == 27); @@ -6247,6 +6429,13 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_WAVTOKENIZER_DEC: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_EPS, hparams.f_norm_group_eps); + ml.get_key(LLM_KV_ATTENTION_GROUPNORM_GROUPS, hparams.n_norm_groups); + ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); + } break; default: (void)0; } @@ -6276,7 +6465,7 @@ static void llm_load_vocab( ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); - if (tokenizer_model == "no_vocab") { + if (tokenizer_model == "no_vocab" || tokenizer_model == "none") { vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens @@ -6414,7 +6603,8 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || - tokenizer_pre == "llama-bpe") { + tokenizer_pre == "llama-bpe"|| + tokenizer_pre == "falcon3") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; vocab.tokenizer_ignore_merges = true; vocab.tokenizer_add_bos = true; @@ -6440,10 +6630,12 @@ static void llm_load_vocab( tokenizer_pre == "phi-2" || tokenizer_pre == "jina-es" || tokenizer_pre == "jina-de" || + tokenizer_pre == "gigachat" || tokenizer_pre == "jina-v1-en" || tokenizer_pre == "jina-v2-es" || tokenizer_pre == "jina-v2-de" || - tokenizer_pre == "jina-v2-code") { + tokenizer_pre == "jina-v2-code" || + tokenizer_pre == "roberta-bpe") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; } else if ( tokenizer_pre == "refact") { @@ -6513,6 +6705,9 @@ static void llm_load_vocab( } else if ( tokenizer_pre == "minerva-7b") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MINERVA; + } else if ( + tokenizer_pre == "megrez") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; } else { throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); } @@ -7091,6 +7286,13 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: max token length = %d\n", __func__, vocab.max_token_len); + if (model.arch == LLM_ARCH_DEEPSEEK) { + LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); + LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); + LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); + LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); + } + if (model.arch == LLM_ARCH_DEEPSEEK2) { LLAMA_LOG_INFO("%s: n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); @@ -7247,6 +7449,22 @@ static const std::map llm_tensor_info_mapping = { {LLM_TENSOR_FFN_UP_EXPS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT_ID}}, // this tensor is loaded for T5, but never used {LLM_TENSOR_DEC_CROSS_ATTN_REL_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_NONE}}, + {LLM_TENSOR_CONV1D, {LLM_TENSOR_LAYER_INPUT, GGML_OP_IM2COL}}, + {LLM_TENSOR_POS_NET_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_NORM1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_NORM2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_CONV1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_POS_NET_CONV2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_POS_NET_ATTN_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_POS_NET_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_POS_NET_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_IM2COL}}, + {LLM_TENSOR_CONVNEXT_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, }; // checks if the weight tensor can be used with the specified buffer type and device @@ -7351,6 +7569,12 @@ static bool weight_buft_supported(const llama_hparams & hparams, ggml_tensor * w ggml_tensor * state = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, S, n_seqs, S, H); op_tensor = ggml_rwkv_wkv6(ctx, k, v, r, tf, td, state); } break; + case GGML_OP_IM2COL: + { + const int n_embd = hparams.n_embd; + ggml_tensor * b = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, n_embd, w->ne[1], 1, 1); + op_tensor = ggml_im2col(ctx, w, b, 1, 0, 0, 0, 1, 0, false, GGML_TYPE_F16); + } break; default: GGML_ABORT("%s: missing test for op %s for tensor %s", __func__, ggml_op_name(op), w->name); } @@ -7481,7 +7705,8 @@ static bool llm_load_tensors( model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - const int n_layer = hparams.n_layer; + const int n_layer = hparams.n_layer; + bool use_mmap_buffer = true; // build a list of buffer types for the CPU and GPU devices @@ -7756,6 +7981,68 @@ static bool llm_load_tensors( } } } break; + case LLM_ARCH_DECI: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); + + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(i); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(i); + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(i); + const int64_t n_ff = hparams.n_ff(i); + const int64_t n_head = hparams.n_head(i); + const int64_t n_head_kv = hparams.n_head_kv(i); + + if (n_head_kv == 0 && n_head > 0) { + // linear attention for DeciLMCausalModel + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + } + else if (n_head_kv > 0) { + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_head_k * n_head}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_v_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_head_k * n_head, n_embd}, 0); + } + + // optional bias tensors + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) { + layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + layer.rope_short = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_SHORT, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + else { + layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, llama_model_loader::TENSOR_NOT_REQUIRED | (i != 0 ? llama_model_loader::TENSOR_DUPLICATED : 0)); + } + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // optional MLP bias + layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, llama_model_loader::TENSOR_NOT_REQUIRED); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, llama_model_loader::TENSOR_NOT_REQUIRED); + } + } break; case LLM_ARCH_MINICPM3: { const int64_t n_embd_head_qk_rope = hparams.n_rot; @@ -8865,6 +9152,55 @@ static bool llm_load_tensors( layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}, 0); } } break; + case LLM_ARCH_DEEPSEEK: + { + + const int64_t n_ff_exp = hparams.n_ff_exp; + const int64_t n_expert_shared = hparams.n_expert_shared; + + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + + if (i < (int) hparams.n_layer_dense_lead) { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } else { + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); + + if (n_expert == 0) { + throw std::runtime_error("n_expert must be > 0"); + } + if (n_expert_used == 0) { + throw std::runtime_error("n_expert_used must be > 0"); + } + + // MoE branch + layer.ffn_gate_exps = create_tensor(tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}, 0); + + // Shared expert branch + layer.ffn_gate_shexp = create_tensor(tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + layer.ffn_down_shexp = create_tensor(tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff_exp * n_expert_shared, n_embd}, 0); + layer.ffn_up_shexp = create_tensor(tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff_exp * n_expert_shared}, 0); + } + } + } break; case LLM_ARCH_DEEPSEEK2: { const bool is_lite = (hparams.n_layer == 27); @@ -9235,9 +9571,9 @@ static bool llm_load_tensors( } break; case LLM_ARCH_CHAMELEON: { - model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output + // output model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, llama_model_loader::TENSOR_NOT_REQUIRED); // if output is NULL, init from the input tok embed @@ -9266,6 +9602,109 @@ static bool llm_load_tensors( layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_WAVTOKENIZER_DEC: + { + model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {hparams.n_embd_features, n_vocab}, 0); + + model.conv1d = create_tensor(tn(LLM_TENSOR_CONV1D, "weight"), {7, hparams.n_embd_features, hparams.posnet.n_embd}, 0); + model.conv1d_b = create_tensor(tn(LLM_TENSOR_CONV1D, "bias"), {1, hparams.posnet.n_embd}, 0); + + // posnet + { + const int64_t n_embd = hparams.posnet.n_embd; + + for (uint32_t i = 0; i < hparams.posnet.n_layer; ++i) { + auto & layer = model.layers[i].posnet; + + // posnet: + // + // - resnet + // - resnet + // - attn + // - resnet + // - resnet + // - norm + // + switch (i) { + case 0: + case 1: + case 3: + case 4: + { + layer.norm1 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "weight", i), {1, n_embd}, 0); + layer.norm1_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM1, "bias", i), {1, n_embd}, 0); + + layer.conv1 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv1_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV1, "bias", i), {1, n_embd}, 0); + + layer.norm2 = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "weight", i), {1, n_embd}, 0); + layer.norm2_b = create_tensor(tn(LLM_TENSOR_POS_NET_NORM2, "bias", i), {1, n_embd}, 0); + + layer.conv2 = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "weight", i), {3, n_embd, n_embd}, 0); + layer.conv2_b = create_tensor(tn(LLM_TENSOR_POS_NET_CONV2, "bias", i), {1, n_embd}, 0); + } break; + case 2: + { + layer.attn_norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.attn_norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + + layer.attn_q = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_q_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_Q, "bias", i), {1, n_embd}, 0); + + layer.attn_k = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_k_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_K, "bias", i), {1, n_embd}, 0); + + layer.attn_v = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_v_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_V, "bias", i), {1, n_embd}, 0); + + layer.attn_o = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "weight", i), {1, n_embd, n_embd}, 0); + layer.attn_o_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_OUT, "bias", i), {1, n_embd}, 0); + } break; + case 5: + { + layer.norm = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "weight", i), {1, n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_POS_NET_ATTN_NORM, "bias", i), {1, n_embd}, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + } + + GGML_ASSERT(hparams.posnet.n_embd == hparams.convnext.n_embd); + + model.tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {hparams.posnet.n_embd}, 0); + model.tok_norm_b = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {hparams.posnet.n_embd}, 0); + + // convnext + { + const int64_t n_embd = hparams.convnext.n_embd; + + for (uint32_t i = 0; i < hparams.convnext.n_layer; ++i) { + auto & layer = model.layers[i].convnext; + + layer.dw = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "weight", i), {7, 1, n_embd}, 0); + layer.dw_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_DW, "bias", i), {1, n_embd}, 0); + + layer.norm = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "weight", i), {n_embd}, 0); + layer.norm_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_NORM, "bias", i), {n_embd}, 0); + + layer.pw1 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "weight", i), {n_embd, n_ff}, 0); + layer.pw1_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW1, "bias", i), {n_ff}, 0); + + layer.pw2 = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "weight", i), {n_ff, n_embd}, 0); + layer.pw2_b = create_tensor(tn(LLM_TENSOR_CONVNEXT_PW2, "bias", i), {n_embd}, 0); + + layer.gamma = create_tensor(tn(LLM_TENSOR_CONVNEXT_GAMMA, "weight", i), {n_embd}, 0); + } + + // output + model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + model.output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0); + } + + model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {hparams.convnext.n_embd, n_embd}, 0); + model.output_b = create_tensor(tn(LLM_TENSOR_OUTPUT, "bias"), {n_embd}, 0); + } break; default: throw std::runtime_error("unknown architecture"); } @@ -9485,6 +9924,7 @@ enum llm_ffn_gate_type { enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, + LLM_NORM_GROUP, }; static struct ggml_tensor * llm_build_inp_embd( @@ -9505,7 +9945,7 @@ static struct ggml_tensor * llm_build_inp_embd( inpL = ggml_get_rows(ctx, tok_embd, lctx.inp_tokens); } else { - lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); + lctx.inp_embd = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, batch.n_tokens); inpL = lctx.inp_embd; ggml_set_input(lctx.inp_embd); } @@ -9626,8 +10066,14 @@ static struct ggml_tensor * llm_build_norm( const llm_build_cb & cb, int il) { switch (type) { - case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; - case LLM_NORM_RMS: cur = ggml_rms_norm(ctx, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM: cur = ggml_norm (ctx, cur, hparams.f_norm_eps); break; + case LLM_NORM_RMS: cur = ggml_rms_norm (ctx, cur, hparams.f_norm_rms_eps); break; + case LLM_NORM_GROUP: + { + cur = ggml_reshape_3d(ctx, cur, cur->ne[0], 1, cur->ne[1]); + cur = ggml_group_norm(ctx, cur, hparams.n_norm_groups, hparams.f_norm_group_eps); + cur = ggml_reshape_2d(ctx, cur, cur->ne[0], cur->ne[2]); + } break; } if (mw || mb) { @@ -10966,6 +11412,167 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_deci() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_head = hparams.n_head(il); + + if (n_head == 0) { + // attention-free layer of Llama-3_1-Nemotron-51B + cur = inpL; + } else { + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + } + + if (n_head > 0 && n_head_kv == 0) { + // "linear attention" of Llama-3_1-Nemotron-51B + cur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wo, cur); + cb(cur, "wo", il); + } else if (n_head > 0) { + // self-attention + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + // modified to support attention-free layer of Llama-3_1-Nemotron-51B + struct ggml_tensor * ffn_inp = cur; + if (n_head > 0) { + ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + } + + // feed-forward network + if (model.layers[il].ffn_gate_inp == nullptr) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, + model.layers[il].ffn_gate, model.layers[il].ffn_gate_b, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + // For Granite architecture + if (hparams.f_residual_scale) { + cur = ggml_scale(ctx0, cur, hparams.f_residual_scale); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + // For Granite architecture + if (hparams.f_logit_scale) { + cur = ggml_scale(ctx0, cur, 1.0f / hparams.f_logit_scale); + } + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_baichuan() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -12995,7 +13602,13 @@ struct llm_build_context { struct ggml_tensor * inp_pos = build_inp_pos(); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask_swa = build_inp_KQ_mask_swa(); + struct ggml_tensor * KQ_mask = nullptr; + if (hparams.n_swa == 0) { + // Phi-4 doesn't use sliding window attention + KQ_mask = build_inp_KQ_mask(); + } else { + KQ_mask = build_inp_KQ_mask_swa(); + } for (int il = 0; il < n_layer; ++il) { auto residual = inpL; @@ -13053,7 +13666,7 @@ struct llm_build_context { cur = llm_build_kv(ctx0, lctx, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask_swa, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il); } if (il == n_layer - 1) { @@ -15219,6 +15832,161 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_deepseek() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + const float kq_scale = hparams.f_attention_scale == 0.0f ? 1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale; + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // rope freq factors for llama3; may return nullptr for llama2 and other models + struct ggml_tensor * rope_factors = build_rope_factors(il); + + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_ext( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, lctx, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, kq_scale, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + if ((uint32_t) il < hparams.n_layer_dense_lead) { + cur = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } else { + // MoE branch + ggml_tensor * moe_out = + llm_build_moe_ffn(ctx0, lctx, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + false, hparams.expert_weights_scale, + cb, il); + cb(moe_out, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * ffn_shexp = llm_build_ffn(ctx0, lctx, cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(ffn_shexp, "ffn_shexp", il); + + cur = ggml_add(ctx0, moe_out, ffn_shexp); + cb(cur, "ffn_out", il); + } + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cur = lctx.cvec.apply_to(ctx0, cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_deepseek2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); @@ -15598,7 +16366,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_encoder() { + struct ggml_cgraph * build_t5_enc() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -15730,7 +16498,7 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_t5_decoder() { + struct ggml_cgraph * build_t5_dec() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); // mutable variable, needed during the last layer of the computation to skip unused tokens @@ -16679,6 +17447,158 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_wavtokenizer_dec() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, inpL)); + + cur = ggml_conv_1d_ph(ctx0, model.conv1d, cur, 1, 1); + cur = ggml_add(ctx0, cur, model.conv1d_b); + + // posnet + for (uint32_t il = 0; il < hparams.posnet.n_layer; ++il) { + const auto & layer = model.layers[il].posnet; + + inpL = cur; + + switch (il) { + case 0: + case 1: + case 3: + case 4: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm1, + layer.norm1_b, + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv1, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv1_b); + + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm2, + layer.norm2_b, + LLM_NORM_GROUP, cb, 0); + + cur = ggml_mul(ctx0, ggml_sigmoid(ctx0, cur), cur); + + cur = ggml_conv_1d_ph(ctx0, layer.conv2, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.conv2_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 2: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.attn_norm, + layer.attn_norm_b, + LLM_NORM_GROUP, cb, 0); + + struct ggml_tensor * q; + struct ggml_tensor * k; + struct ggml_tensor * v; + + q = ggml_conv_1d_ph(ctx0, layer.attn_q, cur, 1, 1); + k = ggml_conv_1d_ph(ctx0, layer.attn_k, cur, 1, 1); + v = ggml_conv_1d_ph(ctx0, layer.attn_v, cur, 1, 1); + + q = ggml_add(ctx0, q, layer.attn_q_b); + k = ggml_add(ctx0, k, layer.attn_k_b); + v = ggml_add(ctx0, v, layer.attn_v_b); + + q = ggml_cont(ctx0, ggml_transpose(ctx0, q)); + k = ggml_cont(ctx0, ggml_transpose(ctx0, k)); + + struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); + + kq = ggml_soft_max_ext(ctx0, kq, nullptr, 1.0f/sqrtf(float(hparams.posnet.n_embd)), 0.0f); + + cur = ggml_mul_mat(ctx0, kq, v); + + cur = ggml_conv_1d_ph(ctx0, layer.attn_o, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.attn_o_b); + + cur = ggml_add(ctx0, cur, inpL); + } break; + case 5: + { + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm, + layer.norm_b, + LLM_NORM_GROUP, cb, 0); + } break; + default: GGML_ABORT("unknown posnet layer"); + }; + } + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.tok_norm, + model.tok_norm_b, + LLM_NORM, cb, -1); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = cur; + + // convnext + for (uint32_t il = 0; il < hparams.convnext.n_layer; ++il) { + const auto & layer = model.layers[il].convnext; + + cur = inpL; + + cur = ggml_conv_1d_dw_ph(ctx0, layer.dw, cur, 1, 1); + cur = ggml_add(ctx0, cur, layer.dw_b); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + layer.norm, + layer.norm_b, + LLM_NORM, cb, -1); + + cur = llm_build_ffn(ctx0, lctx, cur, + layer.pw1, layer.pw1_b, NULL, + NULL, NULL, NULL, + layer.pw2, layer.pw2_b, NULL, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + + cur = ggml_mul(ctx0, cur, layer.gamma); + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + inpL = ggml_add(ctx0, cur, inpL); + } + + cur = inpL; + + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, + model.output_norm_b, + LLM_NORM, cb, -1); + + // lm_head + cur = llm_build_lora_mm(lctx, ctx0, model.output, cur); + + cur = ggml_add(ctx0, cur, model.output_b); + cb(cur, "result_embd", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -16767,6 +17687,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_llama(); } break; + case LLM_ARCH_DECI: + { + result = llm.build_deci(); + } break; case LLM_ARCH_BAICHUAN: { result = llm.build_baichuan(); @@ -16906,6 +17830,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_arctic(); } break; + case LLM_ARCH_DEEPSEEK: + { + result = llm.build_deepseek(); + } break; case LLM_ARCH_DEEPSEEK2: { result = llm.build_deepseek2(); @@ -16921,14 +17849,14 @@ static struct ggml_cgraph * llama_build_graph( case LLM_ARCH_T5: { if (lctx.is_encoding) { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } else { - result = llm.build_t5_decoder(); + result = llm.build_t5_dec(); } } break; case LLM_ARCH_T5ENCODER: { - result = llm.build_t5_encoder(); + result = llm.build_t5_enc(); } break; case LLM_ARCH_JAIS: { @@ -16950,6 +17878,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_chameleon(); } break; + case LLM_ARCH_WAVTOKENIZER_DEC: + { + result = llm.build_wavtokenizer_dec(); + } break; default: GGML_ABORT("fatal error"); } @@ -17041,30 +17973,35 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch) } if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { - GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); - const int64_t n_tokens = ubatch.n_tokens; + //GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); - int32_t * data = (int32_t *) lctx.inp_out_ids->data; - - if (lctx.n_outputs == n_tokens) { - for (int i = 0; i < n_tokens; ++i) { - data[i] = i; - } - } else if (ubatch.output) { - int32_t n_outputs = 0; - for (int i = 0; i < n_tokens; ++i) { - if (ubatch.output[i]) { - data[n_outputs++] = i; - } - } - // the graph needs to have been passed the correct number of outputs - GGML_ASSERT(lctx.n_outputs == n_outputs); - } else if (lctx.n_outputs == 1) { - // only keep last output - data[0] = n_tokens - 1; + if (!lctx.inp_out_ids) { + LLAMA_LOG_WARN("%s: 'lctx.inp_out_ids' is not created\n", __func__); } else { - GGML_ASSERT(lctx.n_outputs == 0); + const int64_t n_tokens = ubatch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); + int32_t * data = (int32_t *) lctx.inp_out_ids->data; + + if (lctx.n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (ubatch.output) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (ubatch.output[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(lctx.n_outputs == n_outputs); + } else if (lctx.n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(lctx.n_outputs == 0); + } } } @@ -17735,6 +18672,7 @@ static int llama_decode_internal( embd = nullptr; // do not extract embeddings when not needed GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } + // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); ggml_backend_sched_alloc_graph(lctx.sched.get(), gf); @@ -20123,10 +21061,12 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_T5ENCODER: case LLM_ARCH_JAIS: case LLM_ARCH_RWKV6: + case LLM_ARCH_WAVTOKENIZER_DEC: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values case LLM_ARCH_LLAMA: + case LLM_ARCH_DECI: case LLM_ARCH_BAICHUAN: case LLM_ARCH_STARCODER: case LLM_ARCH_PLAMO: @@ -20137,6 +21077,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_COMMAND_R: case LLM_ARCH_OLMO: case LLM_ARCH_ARCTIC: + case LLM_ARCH_DEEPSEEK: case LLM_ARCH_DEEPSEEK2: case LLM_ARCH_CHATGLM: case LLM_ARCH_GRANITE: @@ -20239,17 +21180,6 @@ uint64_t llama_model_n_params(const struct llama_model * model) { return model->n_elements; } -struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), - [name](const std::pair & it) { - return it.first == name; - }); - if (it == model->tensors_by_name.end()) { - return nullptr; - } - return it->second; -} - bool llama_model_has_encoder(const struct llama_model * model) { switch (model->arch) { case LLM_ARCH_T5: return true; @@ -21958,6 +22888,8 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { + return LLM_CHAT_TEMPLATE_FALCON_3; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { @@ -22002,6 +22934,10 @@ static llm_chat_template llama_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_RWKV_WORLD; } else if (tmpl_contains("<|start_of_role|>")) { return LLM_CHAT_TEMPLATE_GRANITE; + } else if (tmpl_contains("message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1]")) { + return LLM_CHAT_TEMPLATE_GIGACHAT; + } else if (tmpl_contains("<|role_start|>")) { + return LLM_CHAT_TEMPLATE_MEGREZ; } return LLM_CHAT_TEMPLATE_UNKNOWN; } @@ -22109,6 +23045,15 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|assistant|>\n"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_FALCON_3) { + // Falcon 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << message->content << "\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else if (tmpl == LLM_CHAT_TEMPLATE_ZEPHYR) { // zephyr template for (auto message : chat) { @@ -22326,6 +23271,42 @@ static int32_t llama_chat_apply_template_internal( if (add_ass) { ss << "<|start_of_role|>assistant<|end_of_role|>\n"; } + } else if (tmpl == LLM_CHAT_TEMPLATE_GIGACHAT) { + // GigaChat template + bool has_system = !chat.empty() && std::string(chat[0]->role) == "system"; + + // Handle system message if present + if (has_system) { + ss << "" << chat[0]->content << "<|message_sep|>"; + } else { + ss << ""; + } + + // Process remaining messages + for (size_t i = has_system ? 1 : 0; i < chat.size(); i++) { + std::string role(chat[i]->role); + if (role == "user") { + ss << "user<|role_sep|>" << chat[i]->content << "<|message_sep|>" + << "available functions<|role_sep|>[]<|message_sep|>"; + } else if (role == "assistant") { + ss << "assistant<|role_sep|>" << chat[i]->content << "<|message_sep|>"; + } + } + + // Add generation prompt if needed + if (add_ass) { + ss << "assistant<|role_sep|>"; + } + } else if (tmpl == LLM_CHAT_TEMPLATE_MEGREZ) { + // Megrez template + for (auto message : chat) { + std::string role(message->role); + ss << "<|role_start|>" << role << "<|role_end|>" << message->content << "<|turn_end|>"; + } + + if (add_ass) { + ss << "<|role_start|>assistant<|role_end|>"; + } } else { // template not supported return -1; @@ -22345,15 +23326,15 @@ int32_t llama_chat_apply_template( std::string curr_tmpl(tmpl == nullptr ? "" : tmpl); if (tmpl == nullptr) { GGML_ASSERT(model != nullptr); - // load template from model - std::vector model_template(2048, 0); // longest known template is about 1200 bytes - std::string template_key = "tokenizer.chat_template"; - int32_t res = llama_model_meta_val_str(model, template_key.c_str(), model_template.data(), model_template.size()); - if (res < 0) { + + // load template from model, if available + const auto & it = model->gguf_kv.find("tokenizer.chat_template"); + if (it != model->gguf_kv.end() && it->second.size() > 0) { + curr_tmpl = it->second; + } + else { // worst case: there is no information about template, we will use chatml by default - curr_tmpl = "chatml"; // see llama_chat_apply_template_internal - } else { - curr_tmpl = std::string(model_template.data(), model_template.size()); + curr_tmpl = "chatml"; // see llama_chat_apply_template_internal } } diff --git a/src/unicode.cpp b/src/unicode.cpp index 3d4592635..8ed6b1a51 100644 --- a/src/unicode.cpp +++ b/src/unicode.cpp @@ -71,15 +71,15 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { throw std::invalid_argument("failed to convert utf8 to codepoint"); } -//static std::vector unicode_cpt_to_utf16(uint32_t cp) { +//static std::vector unicode_cpt_to_utf16(uint32_t cpt) { // std::vector result; -// if (/* 0x0000 <= cp && */ cp <= 0xffff) { -// result.emplace_back(cp); +// if (/* 0x0000 <= cpt && */ cpt <= 0xffff) { +// result.emplace_back(cpt); // return result; // } -// if (0x10000 <= cp && cp <= 0x10ffff) { -// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); -// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); +// if (0x10000 <= cpt && cpt <= 0x10ffff) { +// result.emplace_back(0xd800 | ((cpt - 0x10000) >> 10)); +// result.emplace_back(0xdc00 | ((cpt - 0x10000) & 0x03ff)); // return result; // } // throw std::invalid_argument("failed to convert codepoint to utf16"); @@ -120,8 +120,8 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { // return result; //} -static std::vector unicode_cpt_flags_array() { - std::vector cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); +static std::vector unicode_cpt_flags_array() { + std::vector cpt_flags(MAX_CODEPOINTS, unicode_cpt_flags::UNDEFINED); assert (unicode_ranges_flags.begin()[0].first == 0); assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS); @@ -253,8 +253,8 @@ static std::vector unicode_regex_split_custom_gpt2(const std::string & t return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{}; }; size_t _prev_end = offset_ini; @@ -371,8 +371,8 @@ static std::vector unicode_regex_split_custom_llama3(const std::string & return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE; }; - auto _get_flags = [&] (const size_t pos) -> codepoint_flags { - return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; + auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{}; }; size_t _prev_end = offset_ini; @@ -572,29 +572,29 @@ static std::vector unicode_regex_split_custom(const std::string & text, // interface // -std::string unicode_cpt_to_utf8(uint32_t cp) { +std::string unicode_cpt_to_utf8(uint32_t cpt) { std::string result; - if (/* 0x00 <= cp && */ cp <= 0x7f) { - result.push_back(cp); + if (/* 0x00 <= cpt && */ cpt <= 0x7f) { + result.push_back(cpt); return result; } - if (0x80 <= cp && cp <= 0x7ff) { - result.push_back(0xc0 | ((cp >> 6) & 0x1f)); - result.push_back(0x80 | (cp & 0x3f)); + if (0x80 <= cpt && cpt <= 0x7ff) { + result.push_back(0xc0 | ((cpt >> 6) & 0x1f)); + result.push_back(0x80 | (cpt & 0x3f)); return result; } - if (0x800 <= cp && cp <= 0xffff) { - result.push_back(0xe0 | ((cp >> 12) & 0x0f)); - result.push_back(0x80 | ((cp >> 6) & 0x3f)); - result.push_back(0x80 | (cp & 0x3f)); + if (0x800 <= cpt && cpt <= 0xffff) { + result.push_back(0xe0 | ((cpt >> 12) & 0x0f)); + result.push_back(0x80 | ((cpt >> 6) & 0x3f)); + result.push_back(0x80 | (cpt & 0x3f)); return result; } - if (0x10000 <= cp && cp <= 0x10ffff) { - result.push_back(0xf0 | ((cp >> 18) & 0x07)); - result.push_back(0x80 | ((cp >> 12) & 0x3f)); - result.push_back(0x80 | ((cp >> 6) & 0x3f)); - result.push_back(0x80 | (cp & 0x3f)); + if (0x10000 <= cpt && cpt <= 0x10ffff) { + result.push_back(0xf0 | ((cpt >> 18) & 0x07)); + result.push_back(0x80 | ((cpt >> 12) & 0x3f)); + result.push_back(0x80 | ((cpt >> 6) & 0x3f)); + result.push_back(0x80 | (cpt & 0x3f)); return result; } @@ -624,19 +624,19 @@ std::vector unicode_cpts_from_utf8(const std::string & utf8) { return result; } -codepoint_flags unicode_cpt_flags(const uint32_t cp) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); +unicode_cpt_flags unicode_cpt_flags_from_cpt(const uint32_t cpt) { + static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED); static const auto cpt_flags = unicode_cpt_flags_array(); - return cp < cpt_flags.size() ? cpt_flags[cp] : undef; + return cpt < cpt_flags.size() ? cpt_flags[cpt] : undef; } -codepoint_flags unicode_cpt_flags(const std::string & utf8) { - static const codepoint_flags undef(codepoint_flags::UNDEFINED); +unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8) { + static const unicode_cpt_flags undef(unicode_cpt_flags::UNDEFINED); if (utf8.empty()) { return undef; // undefined } size_t offset = 0; - return unicode_cpt_flags(unicode_cpt_from_utf8(utf8, offset)); + return unicode_cpt_flags_from_cpt(unicode_cpt_from_utf8(utf8, offset)); } std::string unicode_byte_to_utf8(uint8_t byte) { @@ -649,41 +649,41 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) { return map.at(utf8); } -uint32_t unicode_tolower(uint32_t cp) { +uint32_t unicode_tolower(uint32_t cpt) { // binary search - auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp, + auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cpt, [](const std::pair & pair, uint32_t value) { return pair.first < value; }); - if (it != unicode_map_lowercase.end() && it->first == cp) { + if (it != unicode_map_lowercase.end() && it->first == cpt) { return it->second; } - return cp; // Return the original code point if no lowercase mapping is found + return cpt; // Return the original code point if no lowercase mapping is found } std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { // unicode categories static const std::map k_ucat_enum = { - { "\\p{N}", codepoint_flags::NUMBER }, - { "\\p{L}", codepoint_flags::LETTER }, - { "\\p{P}", codepoint_flags::PUNCTUATION }, + { "\\p{N}", unicode_cpt_flags::NUMBER }, + { "\\p{L}", unicode_cpt_flags::LETTER }, + { "\\p{P}", unicode_cpt_flags::PUNCTUATION }, }; static const std::map k_ucat_cpt = { - { codepoint_flags::NUMBER, 0xD1 }, - { codepoint_flags::LETTER, 0xD2 }, - { codepoint_flags::PUNCTUATION, 0xD3 }, + { unicode_cpt_flags::NUMBER, 0xD1 }, + { unicode_cpt_flags::LETTER, 0xD2 }, + { unicode_cpt_flags::PUNCTUATION, 0xD3 }, }; static const std::map k_ucat_map = { - { codepoint_flags::NUMBER, "\x30-\x39" }, // 0-9 - { codepoint_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z - { codepoint_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + { unicode_cpt_flags::NUMBER, "\x30-\x39" }, // 0-9 + { unicode_cpt_flags::LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z + { unicode_cpt_flags::PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} }; // compute collapsed codepoints only if needed by at least one regex bool need_collapse = false; - for (auto & regex_expr : regex_exprs) { + for (const auto & regex_expr : regex_exprs) { // search for unicode categories for (const auto & ucat : k_ucat_enum) { if (std::string::npos != regex_expr.find(ucat.first)) { @@ -709,7 +709,7 @@ std::vector unicode_regex_split(const std::string & text, const std continue; } - const auto flags = unicode_cpt_flags(cpts[i]); + const auto flags = unicode_cpt_flags_from_cpt(cpts[i]); if (flags.is_whitespace) { //NOTE: C++ std::regex \s does not mach 0x85, Rust and Python regex does. @@ -725,7 +725,7 @@ std::vector unicode_regex_split(const std::string & text, const std std::vector bpe_offsets = { cpts.size() }; - for (auto & regex_expr : regex_exprs) { + for (const auto & regex_expr : regex_exprs) { // first, see if we have an efficient custom regex implementation auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); @@ -739,7 +739,7 @@ std::vector unicode_regex_split(const std::string & text, const std // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category // with the corresponding collapsed representation bool use_collapsed = false; - for (auto & ucat : k_ucat_enum) { + for (const auto & ucat : k_ucat_enum) { if (std::string::npos != regex_expr.find(ucat.first)) { use_collapsed = true; break; @@ -805,7 +805,7 @@ std::vector unicode_regex_split(const std::string & text, const std // std::wregex \s does not mach non-ASCII whitespaces, using 0x0B as fallback std::wstring wtext(cpts.begin(), cpts.end()); for (size_t i = 0; i < wtext.size(); ++i) { - if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) { + if (wtext[i] > 0x7F && unicode_cpt_flags_from_cpt(wtext[i]).is_whitespace) { wtext[i] = 0x0B; } } diff --git a/src/unicode.h b/src/unicode.h index 008532a24..c27098df7 100644 --- a/src/unicode.h +++ b/src/unicode.h @@ -4,9 +4,7 @@ #include #include -// TODO: prefix all symbols with "llama_" - -struct codepoint_flags { +struct unicode_cpt_flags { enum { UNDEFINED = 0x0001, NUMBER = 0x0002, // regex: \p{N} @@ -35,7 +33,7 @@ struct codepoint_flags { uint16_t is_nfd : 1; // decode from uint16 - inline codepoint_flags(const uint16_t flags=0) { + inline unicode_cpt_flags(const uint16_t flags = 0) { *reinterpret_cast(this) = flags; } @@ -50,18 +48,19 @@ struct codepoint_flags { size_t unicode_len_utf8(char src); -std::string unicode_cpt_to_utf8(uint32_t cp); -uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); +std::string unicode_cpt_to_utf8 (uint32_t cpt); +uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset); + std::vector unicode_cpts_from_utf8(const std::string & utf8); std::vector unicode_cpts_normalize_nfd(const std::vector & cpts); -codepoint_flags unicode_cpt_flags(const uint32_t cp); -codepoint_flags unicode_cpt_flags(const std::string & utf8); +unicode_cpt_flags unicode_cpt_flags_from_cpt (uint32_t cpt); +unicode_cpt_flags unicode_cpt_flags_from_utf8(const std::string & utf8); std::string unicode_byte_to_utf8(uint8_t byte); -uint8_t unicode_utf8_to_byte(const std::string & utf8); +uint8_t unicode_utf8_to_byte(const std::string & utf8); -uint32_t unicode_tolower(uint32_t cp); +uint32_t unicode_tolower(uint32_t cpt); std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index dc1c1e3c9..4430e9bb4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -129,6 +129,7 @@ llama_target_and_test(test-arg-parser.cpp) llama_target_and_test(test-chat-template.cpp) # llama_target_and_test(test-opt.cpp) # SLOW +llama_target_and_test(test-gguf.cpp) llama_target_and_test(test-backend-ops.cpp) llama_target_and_test(test-antiprompts.cpp) llama_target_and_test(test-tool-call.cpp) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index b9454ba59..ccdd3fb57 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -3549,8 +3549,8 @@ static std::vector> make_test_cases_eval() { for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { for (ggml_type type_dst : all_types) { - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); - test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 4, 4, 4})); + test_cases.emplace_back(new test_cpy(type_src, type_dst, {256, 2, 3, 4}, {0, 2, 1, 3})); // cpy by rows } } for (ggml_type type_src : {GGML_TYPE_F16, GGML_TYPE_F32}) { diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 9a246069f..341a920e0 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -75,6 +75,10 @@ int main(void) { "{%- if messages[0][\"role\"] == \"system\" %}\n {%- set system_message = messages[0][\"content\"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr(\"role\", \"equalto\", \"user\") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == \"tool\" or message.role == \"tool_results\" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message[\"role\"] == \"user\") != (ns.index % 2 == 0) %}\n {{- raise_exception(\"After the optional system message, conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message[\"role\"] == \"user\" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- \"[AVAILABLE_TOOLS][\" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- '{\"type\": \"function\", \"function\": {' }}\n {%- for key, val in tool.items() if key != \"return\" %}\n {%- if val is string %}\n {{- '\"' + key + '\": \"' + val + '\"' }}\n {%- else %}\n {{- '\"' + key + '\": ' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- endif %}\n {%- endfor %}\n {{- \"}}\" }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" }}\n {%- endif %}\n {%- endfor %}\n {{- \"[/AVAILABLE_TOOLS]\" }}\n {%- endif %}\n {%- if loop.last and system_message is defined %}\n {{- \"[INST]\" + system_message + \"\\n\\n\" + message[\"content\"] + \"[/INST]\" }}\n {%- else %}\n {{- \"[INST]\" + message[\"content\"] + \"[/INST]\" }}\n {%- endif %}\n {%- elif (message.tool_calls is defined and message.tool_calls is not none) %}\n {{- \"[TOOL_CALLS][\" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- ', \"id\": \"' + tool_call.id + '\"}' }}\n {%- if not loop.last %}\n {{- \", \" }}\n {%- else %}\n {{- \"]\" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message[\"role\"] == \"assistant\" %}\n {{- message[\"content\"] + eos_token}}\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- '[TOOL_RESULTS]{\"content\": ' + content|string + \", \" }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length 9!\") }}\n {%- endif %}\n {{- '\"call_id\": \"' + message.tool_call_id + '\"}[/TOOL_RESULTS]' }}\n {%- else %}\n {{- raise_exception(\"Only user and assistant roles are supported, with the exception of an initial optional system message!\") }}\n {%- endif %}\n{%- endfor %}\n", // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template) "{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'system' %}{{ '[SYSTEM_PROMPT] ' + message['content'] + '[/SYSTEM_PROMPT]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token }}{% else %}{{ raise_exception('Only user, system and assistant roles are supported!') }}{% endif %}{% endfor %}", + // ai-sage/GigaChat-20B-A3B-instruct + "{% if messages[0]['role'] == 'system' -%}\n {%- set loop_messages = messages[1:] -%}\n {%- set system_message = bos_token + messages[0]['content'] + additional_special_tokens[1] -%}\n{%- else -%}\n {%- set loop_messages = messages -%}\n {%- set system_message = bos_token + '' -%}\n{%- endif -%}\n{%- for message in loop_messages %}\n {% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}\n {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}\n {% endif %}\n \n {%- if loop.index0 == 0 -%}\n {{ system_message -}}\n {%- endif -%}\n {%- if message['role'] == 'user' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {{ 'available functions' + additional_special_tokens[0] + additional_special_tokens[2] + additional_special_tokens[3] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if message['role'] == 'assistant' -%}\n {{ message['role'] + additional_special_tokens[0] + message['content'] + additional_special_tokens[1] -}}\n {%- endif -%}\n {%- if loop.last and add_generation_prompt -%}\n {{ 'assistant' + additional_special_tokens[0] -}}\n {%- endif -%}\n{%- endfor %}", + // Infinigence/Megrez-3B-Instruct + u8"{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|role_start|>system<|role_end|>你是Megrez-3B-Instruct,将针对用户的问题给出详细的、积极的回答。<|turn_end|>' }}{% endif %}{{ '<|role_start|>' + message['role'] + '<|role_end|>' + message['content'] + '<|turn_end|>' }}{% endfor %}{% if add_generation_prompt %}{{ '<|role_start|>assistant<|role_end|>' }}{% endif %}" }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -129,6 +133,10 @@ int main(void) { "[INST]You are a helpful assistant\n\nHello[/INST]Hi there[INST]Who are you[/INST] I am an assistant [INST]Another question[/INST]", // mistralai/Mistral-Large-Instruct-2411 (mistralai 'v7' template) "[SYSTEM_PROMPT] You are a helpful assistant[/SYSTEM_PROMPT][INST] Hello[/INST] Hi there
[INST] Who are you[/INST] I am an assistant [INST] Another question[/INST]", + // ai-sage/GigaChat-20B-A3B-instruct + "You are a helpful assistant<|message_sep|>user<|role_sep|>Hello<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>Hi there<|message_sep|>user<|role_sep|>Who are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|> I am an assistant <|message_sep|>user<|role_sep|>Another question<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>", + // Infinigence/Megrez-3B-Instruct + "<|role_start|>system<|role_end|>You are a helpful assistant<|turn_end|><|role_start|>user<|role_end|>Hello<|turn_end|><|role_start|>assistant<|role_end|>Hi there<|turn_end|><|role_start|>user<|role_end|>Who are you<|turn_end|><|role_start|>assistant<|role_end|> I am an assistant <|turn_end|><|role_start|>user<|role_end|>Another question<|turn_end|><|role_start|>assistant<|role_end|>", }; std::vector formatted_chat(1024); int32_t res; @@ -190,6 +198,7 @@ int main(void) { assert(fmt_sys("mistral") == "[INST] You are a helpful assistant\n"); // for old pre-v1 templates assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>"); + assert(fmt_sys("gigachat") == "You are a helpful assistant<|message_sep|>"); // test llama_chat_format_single for user message @@ -214,6 +223,7 @@ int main(void) { assert(fmt_single("mistral") == "[INST] How are you [/INST]"); // for old pre-v1 templates assert(fmt_single("gemma") == "\nuser\nHow are you\nmodel\n"); assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"); + assert(fmt_single("gigachat") == "user<|role_sep|>How are you<|message_sep|>available functions<|role_sep|>[]<|message_sep|>assistant<|role_sep|>"); return 0; } diff --git a/tests/test-gguf.cpp b/tests/test-gguf.cpp new file mode 100644 index 000000000..1bb5fb47c --- /dev/null +++ b/tests/test-gguf.cpp @@ -0,0 +1,1303 @@ +#include "ggml.h" +#include "ggml-backend.h" +#include "../ggml/src/ggml-impl.h" + +#include +#include +#include +#include +#include +#include +#include + +constexpr int offset_has_kv = 1000; +constexpr int offset_has_tensors = 2000; +constexpr int offset_has_data = 3000; + +enum handcrafted_file_type { + HANDCRAFTED_HEADER_BAD_MAGIC = 10, + HANDCRAFTED_HEADER_BAD_VERSION_1 = 20, + HANDCRAFTED_HEADER_BAD_VERSION_FUTURE = 30, + HANDCRAFTED_HEADER_BAD_N_TENSORS = 40, + HANDCRAFTED_HEADER_BAD_N_KV = 50, + HANDCRAFTED_HEADER_EMPTY = 800, + + HANDCRAFTED_KV_BAD_KEY_SIZE = 10 + offset_has_kv, + HANDCRAFTED_KV_BAD_TYPE = 20 + offset_has_kv, + HANDCRAFTED_KV_BAD_VALUE_SIZE = 30 + offset_has_kv, + HANDCRAFTED_KV_DUPLICATE_KEY = 40 + offset_has_kv, + HANDCRAFTED_KV_SUCCESS = 800 + offset_has_kv, + + HANDCRAFTED_TENSORS_BAD_NAME_SIZE = 10 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_N_DIMS = 20 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_SHAPE = 30 + offset_has_tensors, + HANDCRAFTED_TENSORS_NE_TOO_BIG = 40 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_TYPE = 50 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_OFFSET = 60 + offset_has_tensors, + HANDCRAFTED_TENSORS_DUPLICATE_NAME = 70 + offset_has_tensors, + HANDCRAFTED_TENSORS_BAD_ALIGNMENT = 80 + offset_has_tensors, + HANDCRAFTED_TENSORS_SUCCESS = 800 + offset_has_tensors, + HANDCRAFTED_TENSORS_CUSTOM_ALIGN = 810 + offset_has_tensors, + + HANDCRAFTED_DATA_NOT_ENOUGH_DATA = 10 + offset_has_data, + HANDCRAFTED_DATA_BAD_ALIGNMENT = 20 + offset_has_data, + HANDCRAFTED_DATA_SUCCESS = 800 + offset_has_data, + HANDCRAFTED_DATA_CUSTOM_ALIGN = 810 + offset_has_data, +}; + +std::string handcrafted_file_type_name(const enum handcrafted_file_type hft) { + switch (hft) { + case HANDCRAFTED_HEADER_BAD_MAGIC: return "HEADER_BAD_MAGIC"; + case HANDCRAFTED_HEADER_BAD_VERSION_1: return "HEADER_BAD_VERSION_1"; + case HANDCRAFTED_HEADER_BAD_VERSION_FUTURE: return "HEADER_BAD_VERSION_FUTURE"; + case HANDCRAFTED_HEADER_BAD_N_KV: return "HEADER_BAD_N_KV"; + case HANDCRAFTED_HEADER_BAD_N_TENSORS: return "HEADER_BAD_N_TENSORS"; + case HANDCRAFTED_HEADER_EMPTY: return "HEADER_EMPTY"; + + case HANDCRAFTED_KV_BAD_KEY_SIZE: return "KV_BAD_KEY_SIZE"; + case HANDCRAFTED_KV_BAD_TYPE: return "KV_BAD_TYPE"; + case HANDCRAFTED_KV_BAD_VALUE_SIZE: return "KV_BAD_VALUE_SIZE"; + case HANDCRAFTED_KV_DUPLICATE_KEY: return "KV_DUPLICATE_KEY"; + case HANDCRAFTED_KV_SUCCESS: return "KV_RANDOM_KV"; + + case HANDCRAFTED_TENSORS_BAD_NAME_SIZE: return "TENSORS_BAD_NAME_SIZE"; + case HANDCRAFTED_TENSORS_BAD_N_DIMS: return "TENSORS_BAD_N_DIMS"; + case HANDCRAFTED_TENSORS_BAD_SHAPE: return "TENSORS_BAD_SHAPE"; + case HANDCRAFTED_TENSORS_NE_TOO_BIG: return "TENSORS_NE_TOO_BIG"; + case HANDCRAFTED_TENSORS_BAD_TYPE: return "TENSORS_BAD_TYPE"; + case HANDCRAFTED_TENSORS_BAD_OFFSET: return "TENSORS_BAD_OFFSET"; + case HANDCRAFTED_TENSORS_DUPLICATE_NAME: return "TENSORS_DUPLICATE_NAME"; + case HANDCRAFTED_TENSORS_BAD_ALIGNMENT: return "TENSORS_BAD_ALIGNMENT"; + case HANDCRAFTED_TENSORS_SUCCESS: return "TENSORS_SUCCESS"; + case HANDCRAFTED_TENSORS_CUSTOM_ALIGN: return "TENSORS_CUSTOM_ALIGN"; + + case HANDCRAFTED_DATA_NOT_ENOUGH_DATA: return "DATA_NOT_ENOUGH_DATA"; + case HANDCRAFTED_DATA_BAD_ALIGNMENT: return "DATA_BAD_ALIGNMENT"; + case HANDCRAFTED_DATA_SUCCESS: return "DATA_SUCCESS"; + case HANDCRAFTED_DATA_CUSTOM_ALIGN: return "DATA_CUSTOM_ALIGN"; + } + GGML_ABORT("fatal error"); +} + +static bool expect_context_not_null(const enum handcrafted_file_type hft) { + if (hft < offset_has_kv) { + return hft >= HANDCRAFTED_HEADER_EMPTY; + } + if (hft < offset_has_tensors) { + return hft >= HANDCRAFTED_KV_SUCCESS; + } + if (hft < offset_has_data) { + return hft >= HANDCRAFTED_TENSORS_SUCCESS; + } + return hft >= HANDCRAFTED_DATA_SUCCESS; +} + +typedef std::pair> tensor_config_t; + +std::vector get_tensor_configs(std::mt19937 & rng) { + std::vector tensor_configs; + tensor_configs.reserve(100); + + for (int i = 0; i < 100; ++i) { + const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT); + if (ggml_type_size(type) == 0) { + continue; + } + + std::array shape = {1, 1, 1, 1}; + shape[0] = (1 + rng() % 10) * ggml_blck_size(type); + const int n_dims = 1 + rng() % GGML_MAX_DIMS; + for (int i = 1; i < n_dims; ++i) { + shape[i] = 1 + rng() % 10; + } + + tensor_configs.push_back(std::make_pair(type, shape)); + } + + return tensor_configs; +} + +std::vector> get_kv_types(std::mt19937 rng) { + std::vector> kv_types; + kv_types.reserve(100); + + for (int i = 0; i < 100; ++i) { + const gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT); + + if (type == GGUF_TYPE_ARRAY) { + const gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT); + if (type_arr == GGUF_TYPE_ARRAY) { + continue; + } + kv_types.push_back(std::make_pair(type, type_arr)); + continue; + } + + kv_types.push_back(std::make_pair(type, gguf_type(-1))); + } + std::shuffle(kv_types.begin(), kv_types.end(), rng); + + return kv_types; +} + +static void helper_write(const void * data, const size_t nbytes, FILE * file) { + GGML_ASSERT(fwrite(data, 1, nbytes, file) == nbytes); +} + +static FILE * get_handcrafted_file(const unsigned int seed, const enum handcrafted_file_type hft, const int extra_bytes = 0) { + FILE * file = tmpfile(); + + std::mt19937 rng(seed); + + if (hft == HANDCRAFTED_HEADER_BAD_MAGIC) { + const char bad_magic[4] = {'F', 'U', 'G', 'G'}; + helper_write(bad_magic, sizeof(bad_magic), file); + } else { + helper_write(GGUF_MAGIC, 4, file); + } + + if (hft == HANDCRAFTED_HEADER_BAD_VERSION_1) { + const uint32_t version = 1; + helper_write(&version, sizeof(version), file); + } else if (hft == HANDCRAFTED_HEADER_BAD_VERSION_FUTURE) { + const uint32_t version = GGUF_VERSION + 1; + helper_write(&version, sizeof(version), file); + } else { + const uint32_t version = GGUF_VERSION; + helper_write(&version, sizeof(version), file); + } + + std::vector tensor_configs; + if (hft >= offset_has_tensors) { + tensor_configs = get_tensor_configs(rng); + } + + if (hft == HANDCRAFTED_HEADER_BAD_N_TENSORS) { + const uint64_t n_tensors = -1; + helper_write(&n_tensors, sizeof(n_tensors), file); + } else { + const uint64_t n_tensors = tensor_configs.size(); + helper_write(&n_tensors, sizeof(n_tensors), file); + } + + std::vector> kv_types; + if (hft >= offset_has_kv) { + kv_types = get_kv_types(rng); + } + { + uint64_t n_kv = kv_types.size(); + if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + n_kv += 1; + } else if (hft == HANDCRAFTED_HEADER_BAD_N_KV) { + n_kv = -1; + } + helper_write(&n_kv, sizeof(n_kv), file); + } + + if (hft < offset_has_kv) { + for (int i = 0; i < extra_bytes; ++i) { + const char tmp = 0; + helper_write(&tmp, sizeof(tmp), file); + } + rewind(file); + return file; + } + + for (int i = 0; i < int(kv_types.size()); ++i) { + const enum gguf_type type = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].first); + const enum gguf_type type_arr = gguf_type(hft == HANDCRAFTED_KV_BAD_TYPE ? -1 : kv_types[i].second); + + const std::string key = "my_key_" + std::to_string((hft == HANDCRAFTED_KV_DUPLICATE_KEY ? i/2 : i)); + + if (hft == HANDCRAFTED_KV_BAD_KEY_SIZE) { + const uint64_t n = -1; + helper_write(&n, sizeof(n), file); + } else { + const uint64_t n = key.length(); + helper_write(&n, sizeof(n), file); + } + helper_write(key.data(), key.length(), file); + + { + const int32_t type32 = int32_t(type); + helper_write(&type32, sizeof(type32), file); + } + + uint32_t data[16]; + for (int j = 0; j < 16; ++j) { + data[j] = rng(); + if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) { + data[j] |= 0x01010101; // avoid random null-termination of string + } + } + + if (type == GGUF_TYPE_STRING) { + const uint64_t n = rng() % sizeof(data); + helper_write(&n, sizeof(n), file); + helper_write(data, n, file); + continue; + } + + if (type == GGUF_TYPE_ARRAY) { + { + const int32_t type32 = int32_t(type_arr); + helper_write(&type32, sizeof(type32), file); + } + if (type_arr == GGUF_TYPE_STRING) { + const uint64_t nstr = rng() % (16 + 1); + helper_write(&nstr, sizeof(nstr), file); + for (uint64_t istr = 0; istr < nstr; ++istr) { + const uint64_t n = rng() % (sizeof(uint32_t) + 1); + helper_write(&n, sizeof(n), file); + helper_write(&data[istr], n, file); + } + continue; + } + const size_t type_size = gguf_type_size(type_arr); + const uint64_t n = (rng() % sizeof(data)) / type_size; + helper_write(&n, sizeof(n), file); + helper_write(&data, n*type_size, file); + continue; + } + + size_t type_size = hft == HANDCRAFTED_KV_BAD_TYPE ? 1 : gguf_type_size(type); + if (hft == HANDCRAFTED_KV_BAD_VALUE_SIZE) { + type_size += rng() % 3; + } + helper_write(data, type_size, file); + } + + if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + const std::string key = "general.alignment"; + { + const uint64_t n = key.length(); + helper_write(&n, sizeof(n), file); + } + helper_write(key.data(), key.length(), file); + + const int32_t type = gguf_type(GGUF_TYPE_UINT32); + helper_write(&type, sizeof(type), file); + + const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT + 1; + helper_write(&alignment, sizeof(alignment), file); + } + + if (hft < offset_has_tensors) { + for (int i = 0; i < extra_bytes; ++i) { + const char tmp = 0; + helper_write(&tmp, sizeof(tmp), file); + } + rewind(file); + return file; + } + + uint32_t alignment = GGUF_DEFAULT_ALIGNMENT; + if (hft == HANDCRAFTED_TENSORS_BAD_ALIGNMENT || hft == HANDCRAFTED_DATA_BAD_ALIGNMENT) { + alignment -= 1; + } else if (hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN) { + alignment += 1; + } + + uint64_t offset = 0; + for (int i = 0; i < int(tensor_configs.size()); ++i) { + const ggml_type type = tensor_configs[i].first; + const std::array shape = tensor_configs[i].second; + + std::string name = "my_tensor"; + if (hft != HANDCRAFTED_TENSORS_DUPLICATE_NAME) { + name += "_" + std::to_string(i); + } + if (hft == HANDCRAFTED_TENSORS_BAD_NAME_SIZE) { + name += "_with_a_very_long_name_which_is_longer_than_what_is_allowed_for_ggml_tensors"; + GGML_ASSERT(name.length() >= GGML_MAX_NAME); + } + { + const uint64_t n = name.length(); + helper_write(&n, sizeof(n), file); + } + helper_write(name.data(), name.length(), file); + + uint32_t n_dims = hft == HANDCRAFTED_TENSORS_NE_TOO_BIG ? 2 : 1; + for (int i = GGML_MAX_DIMS-1; i >= 1; --i) { + if (shape[i] != 1) { + n_dims = i + 1; + break; + } + } + if (hft == HANDCRAFTED_TENSORS_BAD_N_DIMS) { + const uint32_t n_dims_bad = GGML_MAX_DIMS + 1; + helper_write(&n_dims_bad, sizeof(n_dims_bad), file); + } else { + helper_write(&n_dims, sizeof(n_dims), file); + } + + if (hft == HANDCRAFTED_TENSORS_BAD_SHAPE) { + for (uint32_t j = 0; j < n_dims; ++j) { + const int64_t bad_dim = -1; + helper_write(&bad_dim, sizeof(bad_dim), file); + } + } else if (hft == HANDCRAFTED_TENSORS_NE_TOO_BIG){ + for (uint32_t j = 0; j < n_dims; ++j) { + const int64_t big_dim = 4*int64_t(INT32_MAX); + helper_write(&big_dim, sizeof(big_dim), file); + } + } else { + helper_write(shape.data(), n_dims*sizeof(int64_t), file); + } + + { + const int32_t type32 = hft == HANDCRAFTED_TENSORS_BAD_TYPE ? -1 : int32_t(type); + helper_write(&type32, sizeof(type32), file); + } + + if (hft == HANDCRAFTED_TENSORS_BAD_OFFSET) { + const uint64_t bad_offset = -1; + helper_write(&bad_offset, sizeof(bad_offset), file); + } else { + helper_write(&offset, sizeof(offset), file); + } + + int64_t ne = shape[0]; + for (uint32_t i = 1; i < n_dims; ++i) { + ne *= shape[i]; + } + offset += GGML_PAD(ggml_row_size(type, ne), alignment); + } + + const uint32_t alignment_overshoot = ftell(file) % alignment; + if (alignment_overshoot != 0) { + for (size_t i = alignment_overshoot; i < alignment; ++i) { + const char pad = 0; + helper_write(&pad, sizeof(pad), file); + } + } + + if (hft >= offset_has_data) { + rng.seed(seed + 1); + uint64_t nbytes = offset; + if (hft == HANDCRAFTED_DATA_NOT_ENOUGH_DATA) { + nbytes -= 1; + } + for (uint64_t i = 0; i < nbytes; ++i) { + const uint8_t random_byte = i % 256; + helper_write(&random_byte, sizeof(random_byte), file); + } + } + + for (int i = 0; i < extra_bytes; ++i) { + const char tmp = 0; + helper_write(&tmp, sizeof(tmp), file); + } + rewind(file); + return file; +} + +static bool handcrafted_check_header(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_kv, const bool has_tensors, const bool alignment_defined) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs; + if (has_tensors) { + tensor_configs = get_tensor_configs(rng); + } + std::vector> kv_types; + if (has_kv) { + kv_types = get_kv_types(rng); + } + + bool ok = true; + + if (gguf_get_version(gguf_ctx) != GGUF_VERSION) { + ok = false; + } + if (gguf_get_n_tensors(gguf_ctx) != int(tensor_configs.size())) { + ok = false; + } + if (gguf_get_n_kv(gguf_ctx) != int(alignment_defined ? kv_types.size() + 1 : kv_types.size())) { + ok = false; + } + + return ok; +} + +static bool handcrafted_check_kv(const gguf_context * gguf_ctx, const unsigned int seed, const bool has_tensors, const bool alignment_defined) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs; + if (has_tensors) { + tensor_configs = get_tensor_configs(rng); + } + + std::vector> kv_types = get_kv_types(rng); + + bool ok = true; + + for (int i = 0; i < int(kv_types.size()); ++i) { + const enum gguf_type type = gguf_type(kv_types[i].first); + const enum gguf_type type_arr = gguf_type(kv_types[i].second); + + const std::string key = "my_key_" + std::to_string(i); + + uint32_t data[16]; + for (int j = 0; j < 16; ++j) { + data[j] = rng(); + if (type == GGUF_TYPE_STRING || type_arr == GGUF_TYPE_STRING) { + data[j] |= 0x01010101; // avoid random null-termination of string + } + } + + const char * data8 = reinterpret_cast(data); + const int id = gguf_find_key(gguf_ctx, key.c_str()); + + if (type == GGUF_TYPE_STRING) { + const char * str = gguf_get_val_str(gguf_ctx, id); + const uint64_t n = strlen(str); + const uint64_t n_expected = rng() % sizeof(data); + if (n != n_expected) { + ok = false; + continue; + } + if (!std::equal(str, str + n, data8)) { + ok = false; + } + continue; + } + + if (type == GGUF_TYPE_ARRAY) { + const size_t type_size = gguf_type_size(type_arr); + const uint64_t arr_n = gguf_get_arr_n(gguf_ctx, id); + + if (type_arr == GGUF_TYPE_STRING) { + const uint64_t nstr_expected = rng() % (16 + 1); + if (arr_n != nstr_expected) { + ok = false; + continue; + } + for (uint64_t istr = 0; istr < nstr_expected; ++istr) { + const char * str = gguf_get_arr_str(gguf_ctx, id, istr); + const uint64_t n = strlen(str); + const uint64_t n_expected = rng() % (sizeof(uint32_t) + 1); + + if (n != n_expected) { + ok = false; + continue; + } + const char * str_expected = reinterpret_cast(&data[istr]); + if (strncmp(str, str_expected, n) != 0) { + ok = false; + continue; + } + } + continue; + } + + const uint64_t arr_n_expected = (rng() % sizeof(data)) / type_size; + if (arr_n != arr_n_expected) { + ok = false; + continue; + } + + const char * data_gguf = reinterpret_cast(gguf_get_arr_data(gguf_ctx, id)); + if (!std::equal(data8, data8 + arr_n*type_size, data_gguf)) { + ok = false; + } + continue; + } + + const char * data_gguf = reinterpret_cast(gguf_get_val_data(gguf_ctx, id)); + if (!std::equal(data8, data8 + gguf_type_size(type), data_gguf)) { + ok = false; + } + } + + const uint32_t expected_alignment = alignment_defined ? GGUF_DEFAULT_ALIGNMENT + 1 : GGUF_DEFAULT_ALIGNMENT; + if (gguf_get_alignment(gguf_ctx) != expected_alignment) { + ok = false; + } + + return ok; +} + +static bool handcrafted_check_tensors(const gguf_context * gguf_ctx, const unsigned int seed) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs = get_tensor_configs(rng); + + // Call get_kv_types to get the same RNG state: + get_kv_types(rng); + + bool ok = true; + + const int id_alignment = gguf_find_key(gguf_ctx, "general.alignment"); + const uint32_t alignment = id_alignment >= 0 ? gguf_get_val_u32(gguf_ctx, id_alignment) : GGUF_DEFAULT_ALIGNMENT; + + uint64_t expected_offset = 0; + for (int i = 0; i < int(tensor_configs.size()); ++i) { + const ggml_type type = tensor_configs[i].first; + const std::array shape = tensor_configs[i].second; + + const std::string name = "my_tensor_" + std::to_string(i); + const int id = gguf_find_tensor(gguf_ctx, name.c_str()); + + if (id >= 0) { + if (std::string(gguf_get_tensor_name(gguf_ctx, id)) != name) { + ok = false; + } + + if (gguf_get_tensor_type(gguf_ctx, id) != type) { + ok = false; + } + } else { + ok = false; + continue; + } + + const size_t offset = gguf_get_tensor_offset(gguf_ctx, id); + + if (offset != expected_offset) { + ok = false; + } + + int64_t ne = shape[0]; + for (size_t j = 1; j < GGML_MAX_DIMS; ++j) { + ne *= shape[j]; + } + expected_offset += GGML_PAD(ggml_row_size(type, ne), alignment); + } + + return ok; +} + +static bool handcrafted_check_tensor_data(const gguf_context * gguf_ctx, const unsigned int seed, FILE * file) { + if (!gguf_ctx) { + return false; + } + + std::mt19937 rng(seed); + + std::vector tensor_configs = get_tensor_configs(rng); + + bool ok = true; + + const uint32_t alignment = GGUF_DEFAULT_ALIGNMENT; + + for (int i = 0; i < int(tensor_configs.size()); ++i) { + const ggml_type type = tensor_configs[i].first; + const std::array shape = tensor_configs[i].second; + + int64_t ne = shape[0]; + for (size_t j = 1; j < GGML_MAX_DIMS; ++j) { + ne *= shape[j]; + } + const size_t size = ggml_row_size(type, ne); + + const std::string name = "my_tensor_" + std::to_string(i); + const size_t offset = gguf_get_tensor_offset(gguf_ctx, gguf_find_tensor(gguf_ctx, name.c_str())); + + std::vector data(size); + GGML_ASSERT(fseek(file, gguf_get_data_offset(gguf_ctx) + offset, SEEK_SET) == 0); + GGML_ASSERT(fread(data.data(), 1, size, file) == size); + + for (size_t j = 0; j < size; ++j) { + const uint8_t expected_byte = (j + offset) % 256; + if (data[j] != expected_byte) { + ok = false; + } + } + } + + return ok; +} + +static std::pair test_handcrafted_file(const unsigned int seed) { + int npass = 0; + int ntest = 0; + + const std::vector hfts = { + HANDCRAFTED_HEADER_BAD_MAGIC, + HANDCRAFTED_HEADER_BAD_VERSION_1, + // HANDCRAFTED_FILE_TYPE_BAD_VERSION_FUTURE, // FIXME + HANDCRAFTED_HEADER_BAD_N_KV, + HANDCRAFTED_HEADER_BAD_N_TENSORS, + HANDCRAFTED_HEADER_EMPTY, + + HANDCRAFTED_KV_BAD_KEY_SIZE, + HANDCRAFTED_KV_BAD_TYPE, + // HANDCRAFTED_KV_BAD_VALUE_SIZE, // FIXME sanitizer limit + // HANDCRAFTED_FILE_TYPE_DUPLICATE_KEY, // FIXME + HANDCRAFTED_KV_SUCCESS, + + HANDCRAFTED_TENSORS_BAD_NAME_SIZE, + HANDCRAFTED_TENSORS_BAD_N_DIMS, + HANDCRAFTED_TENSORS_BAD_SHAPE, + HANDCRAFTED_TENSORS_NE_TOO_BIG, + HANDCRAFTED_TENSORS_BAD_TYPE, + // HANDCRAFTED_TENSORS_BAD_OFFSET, // FIXME + HANDCRAFTED_TENSORS_DUPLICATE_NAME, + // HANDCRAFTED_TENSORS_BAD_ALIGNMENT, // FIXME + HANDCRAFTED_TENSORS_SUCCESS, + HANDCRAFTED_TENSORS_CUSTOM_ALIGN, + + HANDCRAFTED_DATA_NOT_ENOUGH_DATA, + // HANDCRAFTED_DATA_BAD_ALIGNMENT, // FIXME + HANDCRAFTED_DATA_SUCCESS, + HANDCRAFTED_DATA_CUSTOM_ALIGN, + }; + + for (enum handcrafted_file_type hft : hfts) { + printf("%s: handcrafted_file_type=%s\n", __func__, handcrafted_file_type_name(hft).c_str()); + FILE * file = get_handcrafted_file(seed, hft); + +#ifdef _WIN32 + if (!file) { + printf("%s: failed to create tmpfile(), needs elevated privileges on Windows"); + printf("%s: skipping tests"); + continue; + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + + struct ggml_context * ctx = nullptr; + struct gguf_init_params gguf_params = { + /*no_alloc =*/ false, + /*ctx =*/ hft >= offset_has_data ? &ctx : nullptr, + }; + struct gguf_context * gguf_ctx = gguf_init_from_file_impl(file, gguf_params); + + if (expect_context_not_null(hft)) { + printf("%s: - context_not_null: ", __func__); + } else { + printf("%s: - context_null: ", __func__); + } + if (bool(gguf_ctx) == expect_context_not_null(hft)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + if (false && hft >= offset_has_data && !expect_context_not_null(hft)) { // FIXME + printf("%s: - no_dangling_ggml_context_pointer: ", __func__); + if (ctx) { + printf("\033[1;31mFAIL\033[0m\n"); + } else { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } + ntest++; + } + + if (false && expect_context_not_null(hft)) { // FIXME + FILE * file_eb = get_handcrafted_file(seed, hft, /*extra_bytes =*/ 1); + struct gguf_context * gguf_ctx_eb = gguf_init_from_file_impl(file_eb, gguf_params); + + printf("%s: - context_null_with_extra_bytes: ", __func__); + if (gguf_ctx_eb) { + printf("\033[1;31mFAIL\033[0m\n"); + } else { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } + ntest++; + + gguf_free(gguf_ctx_eb); + fclose(file_eb); + } + + const bool alignment_defined = hft == HANDCRAFTED_TENSORS_CUSTOM_ALIGN || hft == HANDCRAFTED_DATA_CUSTOM_ALIGN; + + if (expect_context_not_null(hft)) { + printf("%s: - check_header: ", __func__); + if (handcrafted_check_header(gguf_ctx, seed, hft >= offset_has_kv, hft >= offset_has_tensors, alignment_defined)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_kv) { + printf("%s: - check_kv: ", __func__); + if (handcrafted_check_kv(gguf_ctx, seed, hft >= offset_has_tensors, alignment_defined)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_tensors) { + printf("%s: - check_tensors: ", __func__); + if (handcrafted_check_tensors(gguf_ctx, seed)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (expect_context_not_null(hft) && hft >= offset_has_data) { + printf("%s: - check_tensor_data: ", __func__); + if (handcrafted_check_tensor_data(gguf_ctx, seed, file)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + if (gguf_ctx) { + ggml_free(ctx); + gguf_free(gguf_ctx); + } + fclose(file); + printf("\n"); + } + + return std::make_pair(npass, ntest); +} + +struct random_gguf_context_result { + struct gguf_context * gguf_ctx; + struct ggml_context * ctx; + ggml_backend_buffer_t buffer; +}; + +static struct random_gguf_context_result get_random_gguf_context(ggml_backend_t backend, const unsigned int seed) { + std::mt19937 rng(seed); + + struct gguf_context * gguf_ctx = gguf_init_empty(); + + for (int i = 0; i < 256; ++i) { + const std::string key = "my_key_" + std::to_string(rng() % 1024); + const enum gguf_type type = gguf_type(rng() % GGUF_TYPE_COUNT); + + if (type == GGUF_TYPE_STRING || type == GGUF_TYPE_ARRAY) { + continue; // FIXME memory leak + } + + switch (type) { + case GGUF_TYPE_UINT8: gguf_set_val_u8 (gguf_ctx, key.c_str(), rng() % (1 << 7)); break; + case GGUF_TYPE_INT8: gguf_set_val_i8 (gguf_ctx, key.c_str(), rng() % (1 << 7) - (1 << 6)); break; + case GGUF_TYPE_UINT16: gguf_set_val_u16 (gguf_ctx, key.c_str(), rng() % (1 << 15)); break; + case GGUF_TYPE_INT16: gguf_set_val_i16 (gguf_ctx, key.c_str(), rng() % (1 << 15) - (1 << 14)); break; + case GGUF_TYPE_UINT32: gguf_set_val_u32 (gguf_ctx, key.c_str(), rng()); break; + case GGUF_TYPE_INT32: gguf_set_val_i32 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break; + case GGUF_TYPE_FLOAT32: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break; + case GGUF_TYPE_BOOL: gguf_set_val_bool(gguf_ctx, key.c_str(), rng() % 2 == 0); break; + case GGUF_TYPE_STRING: gguf_set_val_str (gguf_ctx, key.c_str(), std::to_string(rng()).c_str()); break; + case GGUF_TYPE_UINT64: gguf_set_val_u64 (gguf_ctx, key.c_str(), rng()); break; + case GGUF_TYPE_INT64: gguf_set_val_i64 (gguf_ctx, key.c_str(), rng() - (1 << 30)); break; + case GGUF_TYPE_FLOAT64: gguf_set_val_f32 (gguf_ctx, key.c_str(), rng() % 1024 - 512); break; + case GGUF_TYPE_ARRAY: { + const enum gguf_type type_arr = gguf_type(rng() % GGUF_TYPE_COUNT); + const uint64_t ne = rng() % 1024; + + switch (type_arr) { + case GGUF_TYPE_UINT8: + case GGUF_TYPE_INT8: + case GGUF_TYPE_UINT16: + case GGUF_TYPE_INT16: + case GGUF_TYPE_UINT32: + case GGUF_TYPE_INT32: + case GGUF_TYPE_FLOAT32: + case GGUF_TYPE_BOOL: + case GGUF_TYPE_UINT64: + case GGUF_TYPE_INT64: + case GGUF_TYPE_FLOAT64: { + const size_t nbytes = ne*gguf_type_size(type_arr); + std::vector random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t)); + for (size_t j = 0; j < random_data.size(); ++j) { + random_data[j] = rng(); + } + gguf_set_arr_data(gguf_ctx, key.c_str(), type_arr, random_data.data(), ne); + } break; + case GGUF_TYPE_STRING: { + std::vector data_cpp(ne); + std::vector data_c(ne); + for (size_t j = 0; j < data_cpp.size(); ++j) { + data_cpp[j] = std::to_string(rng()); + data_c[j] = data_cpp[j].c_str(); + } + gguf_set_arr_str(gguf_ctx, key.c_str(), data_c.data(), ne); + } break; + case GGUF_TYPE_ARRAY: { + break; // not supported + } + case GGUF_TYPE_COUNT: + default: { + GGML_ABORT("fatal error"); + } break; + } + } break; + case GGUF_TYPE_COUNT: + default: { + GGML_ABORT("fatal error"); + } break; + } + } + + struct ggml_init_params ggml_params = { + /*.mem_size =*/ 256*ggml_tensor_overhead(), + /*.mem_buffer =*/ nullptr, + /*.no_alloc =*/ true, + }; + struct ggml_context * ctx = ggml_init(ggml_params); + + for (int i = 0; i < 256; ++i) { + const std::string name = "my_tensor_" + std::to_string(i); + const enum ggml_type type = ggml_type(rng() % GGML_TYPE_COUNT); + const size_t type_size = ggml_type_size(type); + + if (type_size == 0) { + continue; + } + + const int n_dims = 1 + rng() % GGML_MAX_DIMS; + int64_t ne[GGML_MAX_DIMS]; + ne[0] = (1 + rng() % 10) * ggml_blck_size(type); + for (int j = 1; j < n_dims; ++j) { + ne[j] = 1 + rng() % 10; + } + + struct ggml_tensor * tensor = ggml_new_tensor(ctx, type, n_dims, ne); + ggml_set_name(tensor, name.c_str()); + } + + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + const size_t nbytes = ggml_nbytes(t); + std::vector random_data((nbytes + sizeof(uint32_t) - 1) / sizeof(uint32_t)); + for (size_t j = 0; j < random_data.size(); ++j) { + random_data[j] = rng(); + } + ggml_backend_tensor_set(t, random_data.data(), 0, nbytes); + + gguf_add_tensor(gguf_ctx, t); + } + + return {gguf_ctx, ctx, buf}; +} + +static bool all_kv_in_other(const gguf_context * ctx, const gguf_context * other) { + bool ok = true; + + const int n_kv = gguf_get_n_kv(ctx); + for (int id = 0; id < n_kv; ++id) { + const char * name = gguf_get_key(ctx, id); + + const int idx_other = gguf_find_key(other, name); + if (idx_other < 0) { + ok = false; + continue; + } + + const gguf_type type = gguf_get_kv_type(ctx, id); + if (type != gguf_get_kv_type(other, idx_other)) { + ok = false; + continue; + } + + if (type == GGUF_TYPE_ARRAY) { + const int arr_n = gguf_get_arr_n(ctx, id); + if (arr_n != gguf_get_arr_n(other, idx_other)) { + ok = false; + continue; + } + + const gguf_type type_arr = gguf_get_arr_type(ctx, id); + if (type_arr != gguf_get_arr_type(other, idx_other)) { + ok = false; + continue; + } + + if (type_arr == GGUF_TYPE_STRING) { + for (int arr_i = 0; arr_i < arr_n; ++arr_i) { + const std::string str = gguf_get_arr_str(ctx, id, arr_i); + const std::string str_other = gguf_get_arr_str(other, idx_other, arr_i); + if (str != str_other) { + ok = false; + } + } + continue; + } + + const char * data = reinterpret_cast(gguf_get_arr_data(ctx, id)); + const char * data_other = reinterpret_cast(gguf_get_arr_data(other, idx_other)); + if (!std::equal(data, data + arr_n*gguf_type_size(type_arr), data_other)) { + ok = false; + } + continue; + } + + if (type == GGUF_TYPE_STRING) { + const std::string str = gguf_get_val_str(ctx, id); + const std::string str_other = gguf_get_val_str(other, idx_other); + if (str != str_other) { + ok = false; + } + continue; + } + + const char * data = reinterpret_cast(gguf_get_val_data(ctx, id)); + const char * data_other = reinterpret_cast(gguf_get_val_data(other, idx_other)); + if (!std::equal(data, data + gguf_type_size(type), data_other)) { + ok = false; + } + } + + return ok; +} + +static bool all_tensors_in_other(const gguf_context * ctx, const gguf_context * other) { + bool ok = true; + + const int n_tensors = gguf_get_n_tensors(ctx); + for (int id = 0; id < n_tensors; ++id) { + const std::string name = gguf_get_tensor_name(ctx, id); + + const int idx_other = gguf_find_tensor(other, name.c_str()); + if (id != idx_other) { + ok = false; + if (idx_other < 0) { + continue; + } + } + + const ggml_type type = gguf_get_tensor_type(ctx, id); + if (type != gguf_get_tensor_type(other, id)) { + ok = false; + } + + const size_t offset = gguf_get_tensor_offset(ctx, id); + if (offset != gguf_get_tensor_offset(other, id)) { + ok = false; + } + } + + return ok; +} + +static bool same_tensor_data(const struct ggml_context * orig, const struct ggml_context * read) { + bool ok = true; + + struct ggml_tensor * t_orig = ggml_get_first_tensor(orig); + struct ggml_tensor * t_read = ggml_get_first_tensor(read); + while (t_orig) { + if (!t_read) { + ok = false; + break; + } + + const size_t nbytes = ggml_nbytes(t_orig); + if (ggml_nbytes(t_read) != nbytes) { + ok = false; + break; + } + std::vector data_orig(nbytes); + ggml_backend_tensor_get(t_orig, data_orig.data(), 0, nbytes); + if (!std::equal(data_orig.data(), data_orig.data() + nbytes, reinterpret_cast(t_read->data))) { + ok = false; + } + + t_orig = ggml_get_next_tensor(orig, t_orig); + t_read = ggml_get_next_tensor(orig, t_read); + } + if (t_read) { + ok = false; + } + + return true; +} + +static std::pair test_roundtrip(ggml_backend_dev_t dev, const unsigned int seed, const bool only_meta) { + FILE * file = tmpfile(); +#ifdef _WIN32 + if (!file) { + printf("%s: failed to create tmpfile(), needs elevated privileges on Windows"); + printf("%s: skipping tests"); + return std::make_pair(0, 0); + } +#else + GGML_ASSERT(file); +#endif // _WIN32 + + if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { + return std::make_pair(0, 0); // FIXME + } + + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + printf("%s: device=%s, backend=%s, only_meta=%s\n", + __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend), only_meta ? "yes" : "no"); + + int npass = 0; + int ntest = 0; + + struct gguf_context * gguf_ctx_0; + struct ggml_context * ctx_0; + ggml_backend_buffer_t bbuf; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed); + gguf_ctx_0 = result.gguf_ctx; + ctx_0 = result.ctx; + bbuf = result.buffer; + } + + struct gguf_buf gbuf = gguf_buf_init(16 * 1024); + gguf_write_to_buf(gguf_ctx_0, &gbuf, only_meta); + helper_write(gbuf.data, gbuf.offset, file); + rewind(file); + + struct ggml_context * ctx_1 = nullptr; + struct gguf_init_params gguf_params = { + /*no_alloc =*/ false, + /*ctx =*/ only_meta ? nullptr : &ctx_1, + }; + struct gguf_context * gguf_ctx_1 = gguf_init_from_file_impl(file, gguf_params); + + printf("%s: same_version: ", __func__); + if (gguf_get_version(gguf_ctx_0) == gguf_get_version(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: same_n_kv: ", __func__); + if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: same_n_tensors: ", __func__); + if (gguf_get_n_tensors(gguf_ctx_0) == gguf_get_n_tensors(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_orig_kv_in_read: ", __func__); + if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_read_kv_in_orig: ", __func__); + if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_orig_tensors_in_read: ", __func__); + if (all_tensors_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_read_tensors_in_orig: ", __func__); + if (all_tensors_in_other(gguf_ctx_1, gguf_ctx_0)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + if (!only_meta) { + printf("%s: same_tensor_data: ", __func__); + if (same_tensor_data(ctx_0, ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + } + + ggml_backend_buffer_free(bbuf); + ggml_free(ctx_0); + ggml_free(ctx_1); + gguf_free(gguf_ctx_0); + gguf_free(gguf_ctx_1); + gguf_buf_free(gbuf); + ggml_backend_free(backend); + GGML_ASSERT(fclose(file) == 0); + + printf("\n"); + return std::make_pair(npass, ntest); +} + +static std::pair test_gguf_set_kv(ggml_backend_dev_t dev, const unsigned int seed) { + ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr); + printf("%s: device=%s, backend=%s\n", __func__, ggml_backend_dev_description(dev), ggml_backend_name(backend)); + + int npass = 0; + int ntest = 0; + + struct gguf_context * gguf_ctx_0; + struct ggml_context * ctx_0; + ggml_backend_buffer_t bbuf_0; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed); + gguf_ctx_0 = result.gguf_ctx; + ctx_0 = result.ctx; + bbuf_0 = result.buffer; + } + + struct gguf_context * gguf_ctx_1; + struct ggml_context * ctx_1; + ggml_backend_buffer_t bbuf_1; + { + struct random_gguf_context_result result = get_random_gguf_context(backend, seed + 1); + gguf_ctx_1 = result.gguf_ctx; + ctx_1 = result.ctx; + bbuf_1 = result.buffer; + } + + struct gguf_context * gguf_ctx_2 = gguf_init_empty(); + + gguf_set_kv(gguf_ctx_1, gguf_ctx_0); + gguf_set_kv(gguf_ctx_2, gguf_ctx_0); + + printf("%s: same_n_kv: ", __func__); + if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_2)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_kv_0_in_1: ", __func__); + if (all_kv_in_other(gguf_ctx_0, gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_kv_0_in_2: ", __func__); + if (all_kv_in_other(gguf_ctx_0, gguf_ctx_2)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + gguf_set_kv(gguf_ctx_0, gguf_ctx_1); + + printf("%s: same_n_kv_after_double_copy: ", __func__); + if (gguf_get_n_kv(gguf_ctx_0) == gguf_get_n_kv(gguf_ctx_1)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + printf("%s: all_kv_1_in_0_after_double_copy: ", __func__); + if (all_kv_in_other(gguf_ctx_1, gguf_ctx_0)) { + printf("\033[1;32mOK\033[0m\n"); + npass++; + } else { + printf("\033[1;31mFAIL\033[0m\n"); + } + ntest++; + + ggml_backend_buffer_free(bbuf_0); + ggml_backend_buffer_free(bbuf_1); + ggml_free(ctx_0); + ggml_free(ctx_1); + gguf_free(gguf_ctx_0); + gguf_free(gguf_ctx_1); + gguf_free(gguf_ctx_2); + ggml_backend_free(backend); + + printf("\n"); + return std::make_pair(npass, ntest); +} + +static void print_usage() { + printf("usage: test-gguf [seed]\n"); + printf(" if no seed is unspecified then a random seed is used\n"); +} + +int main(int argc, char ** argv) { + if (argc > 2) { + print_usage(); + return 1; + } + + std::random_device rd; + const unsigned int seed = argc < 2 ? rd() : std::stoi(argv[1]); + + // Initialize ggml backends early so the prints aren't interleaved with the test results: + ggml_backend_dev_count(); + fprintf(stderr, "\n"); + + int npass = 0; + int ntest = 0; + { + std::pair result = test_handcrafted_file(seed); + npass += result.first; + ntest += result.second; + } + + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + + for (bool only_meta : {true, false}) { + std::pair result = test_roundtrip(dev, seed, only_meta); + npass += result.first; + ntest += result.second; + } + + { + std::pair result = test_gguf_set_kv(dev, seed); + npass += result.first; + ntest += result.second; + } + } + + printf("%d/%d tests passed\n", npass, ntest); + if (npass != ntest) { + printf("\033[1;31mFAIL\033[0m\n"); + return 1; + } + printf("\033[1;32mOK\033[0m\n"); + return 0; +} diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 5cc0cdb04..e1bdbb925 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -32,13 +32,10 @@ static bool test_build_grammar_fails(const std::string & grammar_str) { static bool match_string(const std::string & input, llama_grammar * grammar) { const auto cpts = unicode_cpts_from_utf8(input); - const llama_grammar_rules & rules = llama_grammar_get_rules (grammar); - llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); + auto & stacks_cur = llama_grammar_get_stacks(grammar); for (const auto & cpt : cpts) { - const llama_grammar_stacks stacks_prev = llama_grammar_get_stacks(grammar); // copy - - llama_grammar_accept(rules, stacks_prev, cpt, stacks_cur); + llama_grammar_accept(grammar, cpt); if (stacks_cur.empty()) { // no stacks means that the grammar failed to match at this point @@ -63,7 +60,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str, auto * grammar = build_grammar(grammar_str); // Save the original grammar stacks so that we can reset after every new string we want to test - const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); + const llama_grammar_stacks stacks_org = llama_grammar_get_stacks(grammar); // copy llama_grammar_stacks & stacks_cur = llama_grammar_get_stacks(grammar); diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index 6f1374ca8..e2129206b 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -113,12 +113,10 @@ int main() } } - llama_grammar * grammar = NULL; std::vector grammar_rules(parsed_grammar.c_rules()); - grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); - if (grammar == nullptr) - { + llama_grammar * grammar = llama_grammar_init_impl(nullptr, grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + if (grammar == nullptr) { throw std::runtime_error("Failed to initialize llama_grammar"); } diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp index e5c9e75e4..c0dcb4848 100644 --- a/tests/test-sampling.cpp +++ b/tests/test-sampling.cpp @@ -145,7 +145,7 @@ static void test_penalties( sampler_tester tester(probs, probs_expected); const size_t n_vocab = probs.size(); - auto * sampler = llama_sampler_init_penalties(n_vocab, LLAMA_TOKEN_NULL, LLAMA_TOKEN_NULL, last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence, false, false); + auto * sampler = llama_sampler_init_penalties(last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence); for (size_t i = 0; i < last_tokens.size(); i++) { llama_sampler_accept(sampler, last_tokens[i]);