diff --git a/.clang-tidy b/.clang-tidy index 3078beacc..952c0cca8 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -12,6 +12,7 @@ Checks: > -readability-implicit-bool-conversion, -readability-magic-numbers, -readability-uppercase-literal-suffix, + -readability-simplify-boolean-expr, clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, diff --git a/.devops/full-cuda.Dockerfile b/.devops/full-cuda.Dockerfile index 77a9ddc14..059fd2695 100644 --- a/.devops/full-cuda.Dockerfile +++ b/.devops/full-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev COPY requirements.txt requirements.txt COPY requirements requirements @@ -26,8 +26,10 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 RUN make diff --git a/.devops/full-rocm.Dockerfile b/.devops/full-rocm.Dockerfile index 8b9633dc4..6ecf3bcc7 100644 --- a/.devops/full-rocm.Dockerfile +++ b/.devops/full-rocm.Dockerfile @@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + RUN make ENTRYPOINT ["/app/.devops/tools.sh"] diff --git a/.devops/full.Dockerfile b/.devops/full.Dockerfile index cef1297d3..432fb5dad 100644 --- a/.devops/full.Dockerfile +++ b/.devops/full.Dockerfile @@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential python3 python3-pip git + apt-get install -y build-essential python3 python3-pip git libcurl4-openssl-dev COPY requirements.txt requirements.txt COPY requirements requirements @@ -15,6 +15,9 @@ WORKDIR /app COPY . . +ENV LLAMA_CURL=1 + + RUN make ENV LC_ALL=C.utf8 diff --git a/.devops/llama-cpp-clblast.srpm.spec b/.devops/llama-cpp-clblast.srpm.spec index 076f29695..774f63ddd 100644 --- a/.devops/llama-cpp-clblast.srpm.spec +++ b/.devops/llama-cpp-clblast.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal diff --git a/.devops/llama-cpp-cublas.srpm.spec b/.devops/llama-cpp-cuda.srpm.spec similarity index 79% rename from .devops/llama-cpp-cublas.srpm.spec rename to .devops/llama-cpp-cuda.srpm.spec index f847ebb1e..ba9cb7cbb 100644 --- a/.devops/llama-cpp-cublas.srpm.spec +++ b/.devops/llama-cpp-cuda.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal @@ -12,7 +12,7 @@ # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries. # It is up to the user to install the correct vendor-specific support. -Name: llama.cpp-cublas +Name: llama.cpp-cuda Version: %( date "+%%Y%%m%%d" ) Release: 1%{?dist} Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL) @@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options. %setup -n llama.cpp-master %build -make -j LLAMA_CUBLAS=1 +make -j LLAMA_CUDA=1 %install mkdir -p %{buildroot}%{_bindir}/ -cp -p main %{buildroot}%{_bindir}/llamacppcublas -cp -p server %{buildroot}%{_bindir}/llamacppcublasserver -cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple +cp -p main %{buildroot}%{_bindir}/llamacppcuda +cp -p server %{buildroot}%{_bindir}/llamacppcudaserver +cp -p simple %{buildroot}%{_bindir}/llamacppcudasimple mkdir -p %{buildroot}/usr/lib/systemd/system -%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacublas.service +%{__cat} < %{buildroot}/usr/lib/systemd/system/llamacuda.service [Unit] Description=Llama.cpp server, CPU only (no GPU support in this build). After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target @@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.t [Service] Type=simple EnvironmentFile=/etc/sysconfig/llama -ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS +ExecStart=/usr/bin/llamacppcudaserver $LLAMA_ARGS ExecReload=/bin/kill -s HUP $MAINPID Restart=never @@ -67,10 +67,10 @@ rm -rf %{buildroot} rm -rf %{_builddir}/* %files -%{_bindir}/llamacppcublas -%{_bindir}/llamacppcublasserver -%{_bindir}/llamacppcublassimple -/usr/lib/systemd/system/llamacublas.service +%{_bindir}/llamacppcuda +%{_bindir}/llamacppcudaserver +%{_bindir}/llamacppcudasimple +/usr/lib/systemd/system/llamacuda.service %config /etc/sysconfig/llama %pre diff --git a/.devops/llama-cpp.srpm.spec b/.devops/llama-cpp.srpm.spec index 446213d69..1d9e4f425 100644 --- a/.devops/llama-cpp.srpm.spec +++ b/.devops/llama-cpp.srpm.spec @@ -1,5 +1,5 @@ # SRPM for building from source and packaging an RPM for RPM-based distros. -# https://fedoraproject.org/wiki/How_to_create_an_RPM_package +# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages # Built and maintained by John Boero - boeroboy@gmail.com # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal diff --git a/.devops/main-cuda.Dockerfile b/.devops/main-cuda.Dockerfile index 2b7faf7c1..b937a4829 100644 --- a/.devops/main-cuda.Dockerfile +++ b/.devops/main-cuda.Dockerfile @@ -20,8 +20,8 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 RUN make diff --git a/.devops/main-intel.Dockerfile b/.devops/main-intel.Dockerfile index 572e5d8ea..274b91b71 100644 --- a/.devops/main-intel.Dockerfile +++ b/.devops/main-intel.Dockerfile @@ -10,14 +10,12 @@ WORKDIR /app COPY . . -RUN mkdir build && \ - cd build && \ - if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ +RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ echo "LLAMA_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ fi && \ - cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ - cmake --build . --config Release --target main + cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target main FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime diff --git a/.devops/main-vulkan.Dockerfile b/.devops/main-vulkan.Dockerfile index bca460365..6c2b2ed5b 100644 --- a/.devops/main-vulkan.Dockerfile +++ b/.devops/main-vulkan.Dockerfile @@ -14,10 +14,8 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key # Build it WORKDIR /app COPY . . -RUN mkdir build && \ - cd build && \ - cmake .. -DLLAMA_VULKAN=1 && \ - cmake --build . --config Release --target main +RUN cmake -B build -DLLAMA_VULKAN=1 && \ + cmake --build build --config Release --target main # Clean up WORKDIR / diff --git a/.devops/nix/package.nix b/.devops/nix/package.nix index c7fa2203a..2c0ae4e2a 100644 --- a/.devops/nix/package.nix +++ b/.devops/nix/package.nix @@ -4,13 +4,14 @@ config, stdenv, mkShell, + runCommand, cmake, ninja, pkg-config, git, python3, mpi, - openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations + blas, cudaPackages, darwin, rocmPackages, @@ -23,7 +24,7 @@ useOpenCL useRocm useVulkan - ], + ] && blas.meta.available, useCuda ? config.cudaSupport, useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL, useMpi ? false, # Increases the runtime closure size by ~700M @@ -35,7 +36,8 @@ # It's necessary to consistently use backendStdenv when building with CUDA support, # otherwise we get libstdc++ errors downstream. effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv, - enableStatic ? effectiveStdenv.hostPlatform.isStatic + enableStatic ? effectiveStdenv.hostPlatform.isStatic, + precompileMetalShaders ? false }@inputs: let @@ -65,10 +67,15 @@ let strings.optionalString (suffices != [ ]) ", accelerated with ${strings.concatStringsSep ", " suffices}"; + executableSuffix = effectiveStdenv.hostPlatform.extensions.executable; + # TODO: package the Python in this repository in a Nix-like way. # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo # is PEP 517-compatible, and ensure the correct .dist-info is generated. # https://peps.python.org/pep-0517/ + # + # TODO: Package up each Python script or service appropriately, by making + # them into "entrypoints" llama-python = python3.withPackages ( ps: [ ps.numpy @@ -87,6 +94,11 @@ let ] ); + xcrunHost = runCommand "xcrunHost" {} '' + mkdir -p $out/bin + ln -s /usr/bin/xcrun $out/bin + ''; + # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64 # separately darwinBuildInputs = @@ -150,13 +162,18 @@ effectiveStdenv.mkDerivation ( postPatch = '' substituteInPlace ./ggml-metal.m \ --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - - # TODO: Package up each Python script or service appropriately. - # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`, - # we could make those *.py into setuptools' entrypoints - substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python" + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";" ''; + # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015, + # `default.metallib` may be compiled with Metal compiler from XCode + # and we need to escape sandbox on MacOS to access Metal compiler. + # `xcrun` is used find the path of the Metal compiler, which is varible + # and not on $PATH + # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion + __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders; + nativeBuildInputs = [ cmake @@ -173,6 +190,8 @@ effectiveStdenv.mkDerivation ( ] ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static + ] ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ + xcrunHost ]; buildInputs = @@ -181,6 +200,7 @@ effectiveStdenv.mkDerivation ( ++ optionals useMpi [ mpi ] ++ optionals useOpenCL [ clblast ] ++ optionals useRocm rocmBuildInputs + ++ optionals useBlas [ blas ] ++ optionals useVulkan vulkanBuildInputs; cmakeFlags = @@ -191,7 +211,7 @@ effectiveStdenv.mkDerivation ( (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true) (cmakeBool "LLAMA_BLAS" useBlas) (cmakeBool "LLAMA_CLBLAST" useOpenCL) - (cmakeBool "LLAMA_CUBLAS" useCuda) + (cmakeBool "LLAMA_CUDA" useCuda) (cmakeBool "LLAMA_HIPBLAS" useRocm) (cmakeBool "LLAMA_METAL" useMetalKit) (cmakeBool "LLAMA_MPI" useMpi) @@ -216,14 +236,16 @@ effectiveStdenv.mkDerivation ( # Should likely use `rocmPackages.clr.gpuTargets`. "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102" ] - ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ] - ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ]; + ++ optionals useMetalKit [ + (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") + (cmakeBool "LLAMA_METAL_EMBED_LIBRARY" (!precompileMetalShaders)) + ]; # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level, # if they haven't been added yet. postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server + mv $out/bin/main${executableSuffix} $out/bin/llama${executableSuffix} + mv $out/bin/server${executableSuffix} $out/bin/llama-server${executableSuffix} mkdir -p $out/include cp $src/llama.h $out/include/ ''; diff --git a/.devops/server-cuda.Dockerfile b/.devops/server-cuda.Dockerfile index 4f83904bc..59a52ba21 100644 --- a/.devops/server-cuda.Dockerfile +++ b/.devops/server-cuda.Dockerfile @@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build ARG CUDA_DOCKER_ARCH=all RUN apt-get update && \ - apt-get install -y build-essential git + apt-get install -y build-essential git libcurl4-openssl-dev WORKDIR /app @@ -20,13 +20,18 @@ COPY . . # Set nvcc architecture ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH} -# Enable cuBLAS -ENV LLAMA_CUBLAS=1 +# Enable CUDA +ENV LLAMA_CUDA=1 +# Enable cURL +ENV LLAMA_CURL=1 RUN make FROM ${BASE_CUDA_RUN_CONTAINER} as runtime +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + COPY --from=build /app/server /server ENTRYPOINT [ "/server" ] diff --git a/.devops/server-intel.Dockerfile b/.devops/server-intel.Dockerfile index 312f2df80..a8e451fa9 100644 --- a/.devops/server-intel.Dockerfile +++ b/.devops/server-intel.Dockerfile @@ -4,23 +4,24 @@ FROM intel/oneapi-basekit:$ONEAPI_VERSION as build ARG LLAMA_SYCL_F16=OFF RUN apt-get update && \ - apt-get install -y git + apt-get install -y git libcurl4-openssl-dev WORKDIR /app COPY . . -RUN mkdir build && \ - cd build && \ - if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ +RUN if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \ echo "LLAMA_SYCL_F16 is set" && \ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \ fi && \ - cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \ - cmake --build . --config Release --target server + cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \ + cmake --build build --config Release --target server FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + COPY --from=build /app/build/bin/server /server ENV LC_ALL=C.utf8 diff --git a/.devops/server-rocm.Dockerfile b/.devops/server-rocm.Dockerfile index e9a31647c..c02a31dd8 100644 --- a/.devops/server-rocm.Dockerfile +++ b/.devops/server-rocm.Dockerfile @@ -40,6 +40,11 @@ ENV LLAMA_HIPBLAS=1 ENV CC=/opt/rocm/llvm/bin/clang ENV CXX=/opt/rocm/llvm/bin/clang++ +# Enable cURL +ENV LLAMA_CURL=1 +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + RUN make ENTRYPOINT [ "/app/server" ] diff --git a/.devops/server-vulkan.Dockerfile b/.devops/server-vulkan.Dockerfile index e0add6fc3..6e757e171 100644 --- a/.devops/server-vulkan.Dockerfile +++ b/.devops/server-vulkan.Dockerfile @@ -11,13 +11,15 @@ RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key apt update -y && \ apt-get install -y vulkan-sdk +# Install cURL +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + # Build it WORKDIR /app COPY . . -RUN mkdir build && \ - cd build && \ - cmake .. -DLLAMA_VULKAN=1 && \ - cmake --build . --config Release --target server +RUN cmake -B build -DLLAMA_VULKAN=1 -DLLAMA_CURL=1 && \ + cmake --build build --config Release --target server # Clean up WORKDIR / diff --git a/.devops/server.Dockerfile b/.devops/server.Dockerfile index 134588fe2..be964e0e8 100644 --- a/.devops/server.Dockerfile +++ b/.devops/server.Dockerfile @@ -3,16 +3,21 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:$UBUNTU_VERSION as build RUN apt-get update && \ - apt-get install -y build-essential git + apt-get install -y build-essential git libcurl4-openssl-dev WORKDIR /app COPY . . +ENV LLAMA_CURL=1 + RUN make FROM ubuntu:$UBUNTU_VERSION as runtime +RUN apt-get update && \ + apt-get install -y libcurl4-openssl-dev + COPY --from=build /app/server /server ENV LC_ALL=C.utf8 diff --git a/.flake8 b/.flake8 index 18fba2c15..d64c2564a 100644 --- a/.flake8 +++ b/.flake8 @@ -1,3 +1,17 @@ [flake8] max-line-length = 125 -ignore = W503 +ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503 +exclude = + # Do not traverse examples + examples, + # Do not include package initializers + __init__.py, + # No need to traverse our git directory + .git, + # There's no value in checking cache directories + __pycache__, + # No need to include the build path + build, + # This contains builds that we don't want to check + dist # This is generated with `python build .` for package releases +# max-complexity = 10 diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml new file mode 100644 index 000000000..de0d994c8 --- /dev/null +++ b/.github/workflows/bench.yml @@ -0,0 +1,310 @@ +# Benchmark +name: Benchmark + +on: + workflow_dispatch: + inputs: + gpu-series: + description: 'Azure GPU series to run with' + required: true + type: choice + options: + - Standard_NC4as_T4_v3 + - Standard_NC24ads_A100_v4 + - Standard_NC80adis_H100_v5 + sha: + description: 'Commit SHA1 to build' + required: false + type: string + duration: + description: 'Duration of the bench' + type: string + default: 10m + + push: + branches: + - master + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + pull_request_target: + types: [opened, synchronize, reopened] + paths: ['llama.cpp', 'ggml.c', 'ggml-backend.c', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp'] + schedule: + - cron: '04 2 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }} + cancel-in-progress: true + +jobs: + bench-server-baseline: + runs-on: Standard_NC4as_T4_v3 + env: + RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it + N_USERS: 8 + DURATION: 10m + + strategy: + matrix: + model: [phi-2] + ftype: [q4_0, q8_0, f16] + include: + - model: phi-2 + ftype: q4_0 + pr_comment_enabled: "true" + + if: | + inputs.gpu-series == 'Standard_NC4as_T4_v3' + || ( + github.event_name == 'schedule' + && github.ref_name == 'master' + && github.repository_owner == 'ggerganov' + ) + || github.event_name == 'pull_request_target' + || ( + github.event_name == 'push' + && github.event.ref == 'refs/heads/master' + && github.repository_owner == 'ggerganov' + ) + steps: + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Install python env + id: pipenv + run: | + cd examples/server/bench + python3 -m venv venv + source venv/bin/activate + pip install -r requirements.txt + + - name: Prometheus + id: install_prometheus + run: | + wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz + tar xzf prometheus*.tar.gz --strip-components=1 + ./prometheus --config.file=examples/server/bench/prometheus.yml & + while ! nc -z localhost 9090; do + sleep 0.1 + done + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: '1.21' + + - name: Install k6 and xk6-sse + id: k6_installation + run: | + cd examples/server/bench + go install go.k6.io/xk6/cmd/xk6@latest + xk6 build master \ + --with github.com/phymbert/xk6-sse + + - name: Build + id: cmake_build + run: | + set -eux + cmake -B build \ + -DLLAMA_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DLLAMA_CUBLAS=ON \ + -DCUDAToolkit_ROOT=/usr/local/cuda \ + -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_CUDA_ARCHITECTURES=75 \ + -DLLAMA_FATAL_WARNINGS=OFF \ + -DLLAMA_ALL_WARNINGS=OFF \ + -DCMAKE_BUILD_TYPE=Release; + cmake --build build --config Release -j $(nproc) --target server + + - name: Download the dataset + id: download_dataset + run: | + cd examples/server/bench + wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + - name: Server bench + id: server_bench + run: | + set -eux + + cd examples/server/bench + source venv/bin/activate + python bench.py \ + --runner-label ${{ env.RUNNER_LABEL }} \ + --name ${{ github.job }} \ + --branch ${{ github.head_ref || github.ref_name }} \ + --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \ + --scenario script.js \ + --duration ${{ github.event.inputs.duration || env.DURATION }} \ + --hf-repo ggml-org/models \ + --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \ + --model-path-prefix /models \ + --parallel ${{ env.N_USERS }} \ + -ngl 33 \ + --batch-size 2048 \ + --ubatch-size 256 \ + --ctx-size 16384 \ + --n-prompts 1000 \ + --max-prompt-tokens 1024 \ + --max-tokens 2048 + + cat results.github.env >> $GITHUB_ENV + + # Remove dataset as we do not want it in the artefact + rm ShareGPT_V3_unfiltered_cleaned_split.json + + - uses: actions/upload-artifact@v4 + with: + name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + compression-level: 9 + path: | + examples/server/bench/*.jpg + examples/server/bench/*.json + examples/server/bench/*.log + + - name: Commit status + uses: Sibz/github-status-action@v1 + with: + authToken: ${{secrets.GITHUB_TOKEN}} + sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }} + context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + description: | + ${{ env.BENCH_RESULTS }} + state: 'success' + + - name: Upload benchmark images + uses: devicons/public-upload-to-imgur@v2.2.2 + continue-on-error: true # Important as it looks unstable: 503 + id: imgur_step + with: + client_id: ${{secrets.IMGUR_CLIENT_ID}} + path: | + examples/server/bench/prompt_tokens_seconds.jpg + examples/server/bench/predicted_tokens_seconds.jpg + examples/server/bench/kv_cache_usage_ratio.jpg + examples/server/bench/requests_processing.jpg + + - name: Extract mermaid + id: set_mermaid + run: | + set -eux + + cd examples/server/bench + PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid) + echo "PROMPT_TOKENS_SECONDS<> $GITHUB_ENV + echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid) + echo "PREDICTED_TOKENS_SECONDS<> $GITHUB_ENV + echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid) + echo "KV_CACHE_USAGE_RATIO<> $GITHUB_ENV + echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + REQUESTS_PROCESSING=$(cat requests_processing.mermaid) + echo "REQUESTS_PROCESSING<> $GITHUB_ENV + echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV + echo "EOF" >> $GITHUB_ENV + + - name: Extract image url + id: extract_image_url + continue-on-error: true + run: | + set -eux + + echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV + echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV + echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV + echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV + + - name: Comment PR + uses: mshick/add-pr-comment@v2 + id: comment_pr + if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }} + with: + message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }} + message: | +

+ + 📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀 + +

+ +
+ + Expand details for performance related PR only + + - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }} + - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }} + - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s + - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s + - ${{ env.BENCH_GRAPH_XLABEL }} + + +

+ + prompt_tokens_seconds + +

+ + More + + ```mermaid + ${{ env.PROMPT_TOKENS_SECONDS }} + ``` + +
+ + predicted_tokens_seconds + +
+ More + + ```mermaid + ${{ env.PREDICTED_TOKENS_SECONDS }} + ``` + +
+ +

+ +
+ + Details + +

+ + kv_cache_usage_ratio + +

+ More + + ```mermaid + ${{ env.KV_CACHE_USAGE_RATIO }} + ``` + +
+ + requests_processing + +
+ More + + ```mermaid + ${{ env.REQUESTS_PROCESSING }} + ``` + +
+ +

+
+
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0f68e4cc3..2d747e688 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,6 +15,10 @@ on: types: [opened, synchronize, reopened] paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m'] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} GGML_NLOOP: 3 @@ -27,7 +31,9 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Dependencies id: depends @@ -41,14 +47,14 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON .. + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON -DLLAMA_CURL=ON .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 - name: Determine tag name id: tag @@ -72,10 +78,10 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-macos-arm64.zip + name: llama-bin-macos-arm64.zip macOS-latest-cmake-x64: runs-on: macos-latest @@ -83,7 +89,9 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Dependencies id: depends @@ -97,7 +105,9 @@ jobs: sysctl -a mkdir build cd build - cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL_EMBED_LIBRARY=ON .. + # Metal is disabled due to intermittent failures with Github runners not having a GPU: + # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313 + cmake -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_METAL=OFF -DLLAMA_CURL=ON .. cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) - name: Test @@ -128,18 +138,21 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip + name: llama-bin-macos-x64.zip ubuntu-focal-make: runs-on: ubuntu-20.04 + env: + LLAMA_NODE_AVAILABLE: true + LLAMA_PYTHON_AVAILABLE: true steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -147,6 +160,14 @@ jobs: sudo apt-get update sudo apt-get install build-essential gcc-8 + - uses: actions/setup-node@v4 + with: + node-version: "20" + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - name: Build id: make_build env: @@ -166,7 +187,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -188,27 +209,67 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 + with: + fetch-depth: 0 - name: Dependencies id: depends run: | sudo apt-get update - sudo apt-get install build-essential + sudo apt-get install build-essential libcurl4-openssl-dev - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON + cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON cmake --build . --config Release -j $(nproc) - name: Test id: cmake_test run: | cd build - ctest -L main --verbose --timeout 900 + ctest -L 'main|curl' --verbose --timeout 900 + + - name: Test llama2c conversion + id: llama2c_test + run: | + cd build + echo "Fetch tokenizer" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.bin + echo "Fetch llama2c model" + wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.bin + ./bin/convert-llama2c-to-ggml --copy-vocab-from-model ./tok512.bin --llama2c-model stories260K.bin --llama2c-output-model stories260K.gguf + ./bin/main -m stories260K.gguf -p "One day, Lily met a Shoggoth" -n 500 -c 256 + + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + cp LICENSE ./build/bin/ + zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip + name: llama-bin-ubuntu-x64.zip # ubuntu-latest-cmake-sanitizer: # runs-on: ubuntu-latest @@ -223,7 +284,7 @@ jobs: # steps: # - name: Clone # id: checkout -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # id: depends @@ -257,7 +318,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -285,7 +346,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -331,7 +392,7 @@ jobs: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build id: cmake_build @@ -372,7 +433,7 @@ jobs: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build id: cmake_build @@ -392,7 +453,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -423,7 +484,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies id: depends @@ -532,6 +593,63 @@ jobs: run: | make swift + windows-msys2: + runs-on: windows-latest + + strategy: + fail-fast: false + matrix: + include: + - { sys: UCRT64, env: ucrt-x86_64, build: Release } + - { sys: CLANG64, env: clang-x86_64, build: Release } + + steps: + - name: Clone + uses: actions/checkout@v4 + + - name: Setup ${{ matrix.sys }} + uses: msys2/setup-msys2@v2 + with: + update: true + msystem: ${{matrix.sys}} + install: >- + base-devel + mingw-w64-${{matrix.env}}-toolchain + mingw-w64-${{matrix.env}}-cmake + mingw-w64-${{matrix.env}}-openblas + + - name: Build using make + shell: msys2 {0} + run: | + make -j $(nproc) + + - name: Clean after building using make + shell: msys2 {0} + run: | + make clean + + - name: Build using make w/ OpenBLAS + shell: msys2 {0} + run: | + make LLAMA_OPENBLAS=1 -j $(nproc) + + - name: Build using CMake + shell: msys2 {0} + run: | + cmake -B build + cmake --build build --config ${{ matrix.build }} -j $(nproc) + + - name: Clean after building using CMake + shell: msys2 {0} + run: | + rm -rf build + + - name: Build using CMake w/ OpenBLAS + shell: msys2 {0} + run: | + cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake --build build --config ${{ matrix.build }} -j $(nproc) + windows-latest-cmake: runs-on: windows-latest @@ -567,7 +685,7 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -697,23 +815,23 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-x64.zip + name: llama-bin-win-${{ matrix.build }}-x64.zip - windows-latest-cmake-cublas: + windows-latest-cmake-cuda: runs-on: windows-latest strategy: matrix: cuda: ['12.2.0', '11.7.1'] - build: ['cublas'] + build: ['cuda'] steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -729,7 +847,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} - name: Determine tag name @@ -753,10 +871,10 @@ jobs: - name: Upload artifacts if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + path: llama-${{ steps.tag.outputs.name }}-bin-win-${{ matrix.build }}-cu${{ matrix.cuda }}-x64.zip + name: llama-bin-win-cu${{ matrix.cuda }}-x64.zip - name: Copy and pack Cuda runtime run: | @@ -767,13 +885,14 @@ jobs: - name: Upload Cuda runtime if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: - path: | - cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + path: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip + name: cudart-llama-bin-win-cu${{ matrix.cuda }}-x64.zip windows-latest-cmake-sycl: runs-on: windows-latest + defaults: run: shell: bash @@ -782,11 +901,10 @@ jobs: WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel - steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -797,12 +915,38 @@ jobs: id: cmake_build run: examples/sycl/win-build-sycl.bat + - name: Determine tag name + id: tag + shell: bash + run: | + BUILD_NUMBER="$(git rev-list --count HEAD)" + SHORT_HASH="$(git rev-parse --short=7 HEAD)" + if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then + echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT + else + SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-') + echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT + fi + + - name: Pack artifacts + id: pack_artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + run: | + 7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/* + + - name: Upload artifacts + if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }} + uses: actions/upload-artifact@v4 + with: + path: llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip + name: llama-bin-win-sycl-x64.zip + ios-xcode-build: runs-on: macos-latest steps: - name: Checkout code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Build Xcode project run: xcodebuild -project examples/llama.swiftui/llama.swiftui.xcodeproj -scheme llama.swiftui -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build @@ -812,7 +956,7 @@ jobs: steps: - name: Clone - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up JDK uses: actions/setup-java@v3 @@ -835,7 +979,7 @@ jobs: # runs-on: macos-12 # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Build # uses: cross-platform-actions/action@v0.19.0 @@ -859,14 +1003,14 @@ jobs: - macOS-latest-make - macOS-latest-cmake - windows-latest-cmake - - windows-latest-cmake-cublas + - windows-latest-cmake-cuda - macOS-latest-cmake-arm64 - macOS-latest-cmake-x64 steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 @@ -885,7 +1029,13 @@ jobs: - name: Download artifacts id: download-artifact - uses: actions/download-artifact@v3 + uses: actions/download-artifact@v4 + with: + path: ./artifact + + - name: Move artifacts + id: move_artifacts + run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release - name: Create release id: create_release @@ -904,7 +1054,7 @@ jobs: const path = require('path'); const fs = require('fs'); const release_id = '${{ steps.create_release.outputs.id }}'; - for (let file of await fs.readdirSync('./artifact')) { + for (let file of await fs.readdirSync('./artifact/release')) { if (path.extname(file) === '.zip') { console.log('uploadReleaseAsset', file); await github.repos.uploadReleaseAsset({ @@ -912,7 +1062,7 @@ jobs: repo: context.repo.repo, release_id: release_id, name: file, - data: await fs.readFileSync(`./artifact/${file}`) + data: await fs.readFileSync(`./artifact/release/${file}`) }); } } @@ -926,7 +1076,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -950,7 +1100,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -974,7 +1124,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | @@ -1004,7 +1154,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Add msbuild to PATH # uses: microsoft/setup-msbuild@v1 @@ -1020,7 +1170,7 @@ jobs: # msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }} # # - name: Upload binaries -# uses: actions/upload-artifact@v1 +# uses: actions/upload-artifact@v4 # with: # name: llama-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} @@ -1043,7 +1193,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Add msbuild to PATH # uses: microsoft/setup-msbuild@v1 @@ -1075,7 +1225,7 @@ jobs: # # - name: Upload binaries # if: matrix.blas == 'ON' -# uses: actions/upload-artifact@v1 +# uses: actions/upload-artifact@v4 # with: # name: llama-blas-bin-${{ matrix.arch }} # path: build/bin/${{ matrix.build }} @@ -1089,7 +1239,7 @@ jobs: # # steps: # - name: Clone -# uses: actions/checkout@v3 +# uses: actions/checkout@v4 # # - name: Dependencies # run: | diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index a151c6780..69c9f4f69 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -12,12 +12,12 @@ jobs: steps: - uses: actions/stale@v5 with: - exempt-issue-labels: "refactor,help wanted,good first issue,research" + exempt-issue-labels: "refactor,help wanted,good first issue,research,bug" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 - operations-per-run: 1000 + operations-per-run: 10000 repo-token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/code-coverage.yml b/.github/workflows/code-coverage.yml index 392db8a08..f12c558f8 100644 --- a/.github/workflows/code-coverage.yml +++ b/.github/workflows/code-coverage.yml @@ -5,12 +5,16 @@ env: GGML_NLOOP: 3 GGML_N_THREADS: 1 +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: run: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Dependencies run: | diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 94f9161fc..9b03d19bc 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -15,6 +15,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: push_to_registry: name: Push Docker image to Docker Hub @@ -42,7 +46,7 @@ jobs: - { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" } steps: - name: Check out the repo - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up QEMU uses: docker/setup-qemu-action@v2 @@ -87,6 +91,12 @@ jobs: echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT fi + - name: Downcase github.repository_owner + run: | + echo "repository_owner_lowercase=${GITHUB_REPOSITORY_OWNER@L}" >> $GITHUB_ENV + env: + GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}' + - name: Build and push Docker image (versioned) if: github.event_name == 'push' uses: docker/build-push-action@v4 @@ -94,7 +104,7 @@ jobs: context: . push: true platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}" file: ${{ matrix.config.dockerfile }} - name: Build and push Docker image (tagged) @@ -103,5 +113,5 @@ jobs: context: . push: ${{ github.event_name == 'push' }} platforms: ${{ matrix.config.platforms }} - tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" + tags: "ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ env.repository_owner_lowercase }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}" file: ${{ matrix.config.dockerfile }} diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml index 0e0993cd4..ae86e9927 100644 --- a/.github/workflows/editorconfig.yml +++ b/.github/workflows/editorconfig.yml @@ -14,10 +14,14 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: editorconfig: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: editorconfig-checker/action-editorconfig-checker@main - run: editorconfig-checker diff --git a/.github/workflows/gguf-publish.yml b/.github/workflows/gguf-publish.yml index 57db17512..3ca4d3058 100644 --- a/.github/workflows/gguf-publish.yml +++ b/.github/workflows/gguf-publish.yml @@ -24,9 +24,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.9.x' - name: Install dependencies diff --git a/.github/workflows/nix-ci-aarch64.yml b/.github/workflows/nix-ci-aarch64.yml index 8d0a3fd7f..4aa4b2379 100644 --- a/.github/workflows/nix-ci-aarch64.yml +++ b/.github/workflows/nix-ci-aarch64.yml @@ -17,6 +17,10 @@ on: types: [opened, synchronize, reopened] paths: ['**/*.nix', 'flake.lock'] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: nix-build-aarch64: runs-on: ubuntu-latest diff --git a/.github/workflows/nix-ci.yml b/.github/workflows/nix-ci.yml index 01c5a9d5a..8955f38d0 100644 --- a/.github/workflows/nix-ci.yml +++ b/.github/workflows/nix-ci.yml @@ -8,6 +8,10 @@ on: pull_request: types: [opened, synchronize, reopened] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: nix-eval: strategy: diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index b82205992..4e0374fc6 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -16,15 +16,19 @@ on: - 'requirements.txt' - 'requirements/*.txt' +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: python-check-requirements: runs-on: ubuntu-latest name: check-requirements steps: - name: Check out source repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python environment - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" - name: Run check-requirements.sh script diff --git a/.github/workflows/python-lint.yml b/.github/workflows/python-lint.yml index ea0a05ea1..a8d46f31d 100644 --- a/.github/workflows/python-lint.yml +++ b/.github/workflows/python-lint.yml @@ -2,19 +2,22 @@ name: flake8 Lint on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: flake8-lint: runs-on: ubuntu-latest name: Lint steps: - name: Check out source repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python environment - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: "3.11" - name: flake8 Lint uses: py-actions/flake8@v2 with: - ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503" - exclude: "examples/*,examples/*/**,*/**/__init__.py" + plugins: "flake8-no-print" diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 65ca7d9ca..afac89c5b 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -4,6 +4,10 @@ name: Server on: workflow_dispatch: # allows manual triggering inputs: + sha: + description: 'Commit SHA1 to build' + required: false + type: string slow_tests: description: 'Run slow tests' required: true @@ -11,12 +15,16 @@ on: push: branches: - master - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*'] - pull_request: + paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] + pull_request_target: types: [opened, synchronize, reopened] - paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*'] + paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*'] schedule: - - cron: '0 0 * * *' + - cron: '2 4 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true jobs: server: @@ -31,53 +39,70 @@ jobs: include: - build_type: Release sanitizer: "" - disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken - container: - image: ubuntu:latest - ports: - - 8888 - options: --cpus 4 - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - name: Dependencies id: depends run: | - apt-get update - apt-get -y install \ + sudo apt-get update + sudo apt-get -y install \ build-essential \ + xxd \ git \ cmake \ - python3-pip \ + curl \ wget \ language-pack-en \ libcurl4-openssl-dev - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. \ - -DLLAMA_NATIVE=OFF \ - -DLLAMA_BUILD_SERVER=ON \ - -DLLAMA_CURL=ON \ - -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server + - name: Clone + id: checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} + + - name: Python setup + id: setup_python + uses: actions/setup-python@v5 + with: + python-version: '3.11' - name: Tests dependencies id: test_dependencies run: | pip install -r examples/server/tests/requirements.txt + - name: Verify server deps + id: verify_server_deps + run: | + git config --global --add safe.directory $(realpath .) + cd examples/server + git ls-files --others --modified + git status + ./deps.sh + git status + not_ignored_files="$(git ls-files --others --modified)" + echo "Modified files: ${not_ignored_files}" + if [ -n "${not_ignored_files}" ]; then + echo "Repository is dirty or server deps are not built as expected" + echo "${not_ignored_files}" + exit 1 + fi + + - name: Build + id: cmake_build + run: | + cmake -B build \ + -DLLAMA_NATIVE=OFF \ + -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ + -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ + -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; + cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target server + + - name: Tests id: server_integration_tests if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} @@ -99,9 +124,10 @@ jobs: steps: - name: Clone id: checkout - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 0 + ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }} - name: libCURL id: get_libcurl @@ -115,10 +141,8 @@ jobs: - name: Build id: cmake_build run: | - mkdir build - cd build - cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" - cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server + cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" + cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup id: setup_python diff --git a/.github/workflows/zig-build.yml b/.github/workflows/zig-build.yml index 68a698ab9..747c35cc0 100644 --- a/.github/workflows/zig-build.yml +++ b/.github/workflows/zig-build.yml @@ -6,6 +6,10 @@ on: branches: - master +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }} + cancel-in-progress: true + jobs: build: strategy: @@ -14,7 +18,7 @@ jobs: runs-on: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.runs-on }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: recursive fetch-depth: 0 diff --git a/.gitignore b/.gitignore index 51aa84222..50ae0973a 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ *.a *.so *.gguf +*.gguf.json *.bin *.exe *.dll @@ -34,6 +35,7 @@ lcov-report/ gcovr-report/ build* +!build.zig cmake-build-* out/ tmp/ @@ -48,8 +50,10 @@ models-mnt /convert-llama2c-to-ggml /embd-input-test /embedding +/eval-callback /gguf /gguf-llama-simple +/gguf-split /gritlm /imatrix /infill @@ -58,6 +62,9 @@ models-mnt /llava-cli /lookahead /lookup +/lookup-create +/lookup-merge +/lookup-stats /main /metal /passkey @@ -73,6 +80,7 @@ models-mnt /batched-bench /export-lora /finetune +/retrieval /speculative /parallel /train-text-from-scratch @@ -94,7 +102,25 @@ qnt-*.txt perf-*.txt examples/jeopardy/results.txt +examples/server/*.html.hpp +examples/server/*.js.hpp +examples/server/*.mjs.hpp poetry.lock poetry.toml nppBackup + +# Test binaries +/tests/test-grammar-parser +/tests/test-llama-grammar +/tests/test-double-float +/tests/test-grad0 +/tests/test-opt +/tests/test-quantize-fns +/tests/test-quantize-perf +/tests/test-sampling +/tests/test-tokenizer-0 +/tests/test-tokenizer-1-spm +/tests/test-tokenizer-1-bpe +/tests/test-rope +/tests/test-backend-ops diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 65796fe2e..91d791628 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,13 +3,14 @@ exclude: prompts/.*.txt repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.2.0 + rev: v4.6.0 hooks: - id: trailing-whitespace - id: end-of-file-fixer - id: check-yaml - id: check-added-large-files - repo: https://github.com/PyCQA/flake8 - rev: 6.0.0 + rev: 7.0.0 hooks: - id: flake8 + additional_dependencies: [flake8-no-print] diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000..b029f13da --- /dev/null +++ b/AUTHORS @@ -0,0 +1,655 @@ +# date: Tue Apr 9 09:17:14 EEST 2024 +# this file is auto-generated by scripts/gen-authors.sh + +0cc4m +0xspringtime <110655352+0xspringtime@users.noreply.github.com> +2f38b454 +3ooabkhxtn <31479382+3ooabkhxtn@users.noreply.github.com> +44670 <44670@users.noreply.github.com> +AN Long +AT +Aarni Koskela +Aaron Miller +Aaryaman Vasishta +Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> +Abhishek Gopinath K <31348521+overtunned@users.noreply.github.com> +Adithya Balaji +AdithyanI +Adrian +Adrian Hesketh +AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> +Aisuko +Alberto <57916483+albbus-stack@users.noreply.github.com> +Alex +Alex Azarov +Alex Azarov +Alex Klinkhamer +Alex Klinkhamer +Alex Nguyen +Alex Petenchea +Alex Renda +Alex von Gluck IV +Alexey Parfenov +Ali Chraghi <63465728+alichraghi@users.noreply.github.com> +Ali Nehzat +Ali Tariq +Alon +AlpinDale <52078762+AlpinDale@users.noreply.github.com> +AmirAli Mirian <37371367+amiralimi@users.noreply.github.com> +Ananta Bastola +Anas Ahouzi <112881240+aahouzi@users.noreply.github.com> +András Salamon +Andrei +Andrew Canis +Andrew Duffy +Andrew Godfrey +Arik Poznanski +Artem +Artyom Lebedev +Asbjørn Olling +Ásgeir Bjarni Ingvarsson +Ashok Gelal <401055+ashokgelal@users.noreply.github.com> +Ashraful Islam +Atsushi Tatsuma +Austin <77757836+teleprint-me@users.noreply.github.com> +AustinMroz +BADR +Bach Le +Bailey Chittle <39804642+bachittle@users.noreply.github.com> +BarfingLemurs <128182951+BarfingLemurs@users.noreply.github.com> +Behnam M <58621210+ibehnam@users.noreply.github.com> +Ben Garney +Ben Siraphob +Ben Williams +Benjamin Lecaillon <84293038+blecaillon@users.noreply.github.com> +Bernat Vadell +Bodo Graumann +Bono Lv +Borislav Stanimirov +Branden Butler +Brian +Bruce MacDonald +CJ Pais +CRD716 +Cameron +Cameron Kaiser +Casey Primozic +Casey Primozic +CausalLM <148736309+CausalLM@users.noreply.github.com> +Cebtenzzre +Chad Brewbaker +Cheng Shao +Chris Kuehl +Christian Demsar +Christian Demsar +Christian Falch <875252+chrfalch@users.noreply.github.com> +Christian Kögler +Clark Saben <76020733+csaben@users.noreply.github.com> +Clint Herron +Cuong Trinh Manh +DAN™ +Damian Stewart +Dane Madsen +DaniAndTheWeb <57776841+DaniAndTheWeb@users.noreply.github.com> +Daniel Bevenius +Daniel Drake +Daniel Hiltgen +Daniel Illescas Romero +DannyDaemonic +Dat Quoc Nguyen <2412555+datquocnguyen@users.noreply.github.com> +Dave Della Costa +David Friehs +David Kennedy +David Pflug +David Renshaw +David Sommers <12738+databyte@users.noreply.github.com> +David Yang +Dawid Wysocki <62249621+TortillaZHawaii@users.noreply.github.com> +Dean +Deins +Didzis Gosko +Don Mahurin +DooWoong Lee (David) +Doomsdayrs <38189170+Doomsdayrs@users.noreply.github.com> +Douglas Hanley +Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com> +Ebey Abraham +Ed Lee +Ed Lepedus +Edward Taylor +Elbios <141279586+Elbios@users.noreply.github.com> +Engininja2 <139037756+Engininja2@users.noreply.github.com> +Equim +Eric Sommerlade +Eric Zhang <34133756+EZForever@users.noreply.github.com> +Erik Garrison +Erik Scholz +Ettore Di Giacinto +Evan Jones +Evan Miller +Eve <139727413+netrunnereve@users.noreply.github.com> +Evgeny Kurnevsky +Ewout ter Hoeven +ExtReMLapin <3909752+ExtReMLapin@users.noreply.github.com> +FK +Fabian +Fabio R. Sluzala +Faez Shakil +FantasyGmm <16450052+FantasyGmm@users.noreply.github.com> +Fattire <528174+fat-tire@users.noreply.github.com> +Felix +Finn Voorhees +Firat +Folko-Ven <71110216+Folko-Ven@users.noreply.github.com> +Foul-Tarnished <107711110+Foul-Tarnished@users.noreply.github.com> +Francisco Melo <43780565+francis2tm@users.noreply.github.com> +FrankHB +Frederik Vogel +Gabe Goodhart +GainLee +Galunid +Gary Linscott +Gary Mulder +Genkagaku.GPT +Georgi Gerganov +Gilad S +GiviMAD +Govlzkoy +Guillaume "Vermeille" Sanchez +Guillaume Wenzek +Guoteng <32697156+SolenoidWGT@users.noreply.github.com> +Gustavo Rocha Dias <91472747+gustrd@users.noreply.github.com> +Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com> +Haohui Mai +Haoxiang Fei +Harald Fernengel +Hatsune Miku <129688334+at8u@users.noreply.github.com> +Henk Poley +Henri Vasserman +Henrik Forstén +Herman Semenov +Hesen Peng +Hoang Nguyen +Hongyu Ouyang <96765450+casavaca@users.noreply.github.com> +Howard Su +Hua Jiang +Huawei Lin +Ian Bull +Ian Bull +Ian Scrivener +Ido S +IgnacioFDM +Igor Okulist +Ikko Eltociear Ashimine +Ilya Kurdyukov <59548320+ilyakurdyukov@users.noreply.github.com> +Ionoclast Laboratories +Isaac McFadyen +IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> +Ivan Komarov +Ivan Stepanov +JH23X <165871467+JH23X@users.noreply.github.com> +Jack Mousseau +JackJollimore <130917767+JackJollimore@users.noreply.github.com> +Jag Chadha +Jakub N +James Reynolds +Jan Boon +Jan Boon +Jan Ploski +Jannis Schönleber +Jared Van Bortel +Jared Van Bortel +Jason McCartney +Jean-Christophe Hoelt +Jean-Michaël Celerier +Jed Fox +Jeffrey Quesnelle +Jesse Jojo Johnson +Jhen-Jie Hong +Jiahao Li +Jian Liao +JidongZhang-THU <1119708529@qq.com> +Jinwoo Jeong <33892306+williamjeong2@users.noreply.github.com> +Jiří Podivín <66251151+jpodivin@users.noreply.github.com> +Johannes Gäßler +Johannes Rudolph +John <78893154+cmp-nct@users.noreply.github.com> +John Balis +John Smith <67539080+kingsidelee@users.noreply.github.com> +JohnnyB +Jonas Wunderlich <32615971+jonas-w@users.noreply.github.com> +Jorge A <161275481+jorgealias@users.noreply.github.com> +Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com> +Joseph Stahl <1269177+josephst@users.noreply.github.com> +Joyce +Juan Calderon-Perez <835733+gaby@users.noreply.github.com> +Judd +Julius Arkenberg +Jun Jie <71215065+junnjiee16@users.noreply.github.com> +Juraj Bednar +Justin Parker +Justin Suess +Justine Tunney +Juuso Alasuutari +KASR +Kamil Tomšík +Karsten Weiss +Karthick +Karthik Kumar Viswanathan <195178+guilt@users.noreply.github.com> +Karthik Sethuraman +Kasumi <90275229+kasumi-1@users.noreply.github.com> +Kawrakow <48489457+ikawrakow@users.noreply.github.com> +Keiichi Tabata +Kenvix ⭐ +Kerfuffle <44031344+KerfuffleV2@users.noreply.github.com> +Kevin Ji <1146876+kevinji@users.noreply.github.com> +Kevin Kwok +Kevin Lo +Kolen Cheung +Konstantin Herud +Konstantin Zhuravlyov +Kunshang Ji +Kyle Liang +Kyle Mistele +Kylin <56434533+KyL0N@users.noreply.github.com> +Lars Grammel +Laura +Lee <44310445+lx200916@users.noreply.github.com> +Lee Drake +Leng Yue +LeonEricsson <70749762+LeonEricsson@users.noreply.github.com> +Leonardo Neumann +Li Tan +Linwei Wang +LoganDark +LostRuins <39025047+LostRuins@users.noreply.github.com> +Luciano +Luo Tian +M. Yusuf Sarıgöz +Maarten ter Huurne +Mack Straight +Maël Kerbiriou +MaggotHATE +Marc Köhlbrugge +Marco Matthies <71844+marcom@users.noreply.github.com> +Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> +Marian Cepok +Mark Fairbairn +Marko Tasic +Martin Krasser +Martin Schwaighofer +Marvin Gießing +Mateusz Charytoniuk +Matheus C. França +Matheus Gabriel Alves Silva +Mathieu Nayrolles +Mathijs de Bruin +Matt Clayton <156335168+mattjcly@users.noreply.github.com> +Matt Pulver +Matteo Boschini <12133566+mbosc@users.noreply.github.com> +Matthew Tejo +Matvey Soloviev +Maxime <672982+maximegmd@users.noreply.github.com> +Maximilian Winter +Meng Zhang +Meng, Hengyu +Merrick Christensen +Michael Coppola +Michael Hueschen +Michael Kesper +Michael Klimenko +Michael Podvitskiy +Michael Potter +Michaël de Vries +Mihai +Mike +Minsoo Cheong <54794500+mscheong01@users.noreply.github.com> +Mirko185 +Mirror Azure <54669636+MirrorAzure@users.noreply.github.com> +Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> +Mohammadreza Hendiani +Murilo Santana +Musab Gultekin +Nam D. Tran <42194884+namtranase@users.noreply.github.com> +NawafAlansari <72708095+NawafAlansari@users.noreply.github.com> +Nebula +Neo Zhang Jianyu +Neuman Vong +Nexesenex <124105151+Nexesenex@users.noreply.github.com> +Niall Coates <1349685+Niall-@users.noreply.github.com> +Nicolai Weitkemper +Nigel Bosch +Niklas Korz +Nindaleth +Oleksandr Nikitin +Oleksii Maryshchenko +Olivier Chafik +Ondřej Čertík +Ouadie EL FAROUKI +Paul Tsochantaris +Pavol Rusnak +Pedro Cuenca +Peter Sugihara +Phil H <5756783+phiharri@users.noreply.github.com> +Philip Taron +Phillip Kravtsov +Pierre Alexandre SCHEMBRI +Pierrick Hymbert +Przemysław Pawełczyk +Qin Yue Chen <71813199+chenqiny@users.noreply.github.com> +Qingyou Meng +Qu Zongfu <43257352+yancaoweidaode@users.noreply.github.com> +RJ Adriaansen +Radoslav Gerganov +Radosław Gryta +Rahul Vivek Nair <68507071+RahulVivekNair@users.noreply.github.com> +Rand Xie +Randall Fitzgerald +Reinforce-II +Riceball LEE +Richard Kiss +Richard Roberson +Rick G <26732651+TheFlipbook@users.noreply.github.com> +Rickard Edén +Rickard Hallerbäck +Rickey Bowers Jr +Riley Stewart +Rinne +Rinne +Robert Brisita <986796+rbrisita@users.noreply.github.com> +Robert Sung-wook Shin +Robey Holderith +Robyn +Roger Meier +Roland <14355895+rbur0425@users.noreply.github.com> +Romain D <90720+Artefact2@users.noreply.github.com> +Romain Neutron +Roman Parykin +Ron Evans +Ron Jailall +Ronny Brendel +Ronsor +Rowan Hart +Rune <43761327+Rune-AI@users.noreply.github.com> +Ryan Landay +Ryder Wishart +Rőczey Barnabás <31726601+An0nie@users.noreply.github.com> +SakuraUmi +Salvador E. Tropea +Sam Spilsbury +Sami Farin <3876865+Safari77@users.noreply.github.com> +Samuel Maynard +Sang-Kil Park +Seb C <47074056+Sebby37@users.noreply.github.com> +Sebastián A +SebastianApel <13675545+SebastianApel@users.noreply.github.com> +Senemu <10880819+Senemu@users.noreply.github.com> +Sergey Alirzaev +Sergio López +SeungWon Jeong <65549245+redlion0929@users.noreply.github.com> +ShadovvBeast +Shakhar Dasgupta +Shangning Xu <32517059+xushangning@users.noreply.github.com> +Shijie <821898965@qq.com> +Shintarou Okada +Shouzheng Liu <61452103+lshzh-ww@users.noreply.github.com> +Shouzheng Liu +Sigbjørn Skjæret +Simon Willison +Siwen Yu +Sky Yan +Slaren <2141330+slaren@users.noreply.github.com> +Slava Primenko +SoftwareRenderer <138734813+SoftwareRenderer@users.noreply.github.com> +Someone +Someone Serge +Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> +Spencer Sutton +Srinivas Billa +Stefan Sydow +Stephan Walter +Stephen Nichols +Steve Grubb +Steven Roussey +Steward Garcia <57494570+FSSRepo@users.noreply.github.com> +Suaj Carrot <72162667+SuajCarrot@users.noreply.github.com> +SuperUserNameMan +Tai Duc Nguyen +Taikono-Himazin +Tameem <113388789+AhmadTameem@users.noreply.github.com> +Tamotsu Takahashi +Thái Hoàng Tâm <75922889+RoyalHeart@users.noreply.github.com> +Thatcher Chamberlin +Theia Vogel +Thérence <13496987+Royalphax@users.noreply.github.com> +Thibault Terrasson +Thomas Klausner +Tim Miller +Timmy Knight +Timothy Cronin <40186632+4imothy@users.noreply.github.com> +Ting Lou +Ting Sun +Tobias Lütke +Tom C +Tom Jobbins <784313+TheBloke@users.noreply.github.com> +Tomas +Tomáš Pazdiora +Tristan Ross +Tungsten842 <886724vf@anonaddy.me> +Tungsten842 +Tushar +UEXTM.com <84163508+uextm@users.noreply.github.com> +Uzo Nweke +Vaibhav Srivastav +Val Kharitonov +Valentin Konovalov +Valentyn Bezshapkin <61702053+valentynbez@users.noreply.github.com> +Victor Z. Peng +Vlad +Vladimir +Vladimir Malyutin +Vladimir Zorin +Volodymyr Vitvitskyi <72226+signalpillar@users.noreply.github.com> +WangHaoranRobin <56047610+WangHaoranRobin@users.noreply.github.com> +Weird Constructor +Welby Seely +Wentai Zhang +WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com> +Willy Tarreau +Wu Jian Ping +Wu Jian Ping +Xiake Sun +Xiang (Kevin) Li +Xiao-Yong Jin +XiaotaoChen +Xiaoyi Chen +Xingchen Song(宋星辰) +Xuan Son Nguyen +Yann Follet <131855179+YannFollet@users.noreply.github.com> +Yiming Cui +Yishuo Wang +Yueh-Po Peng <94939112+y10ab1@users.noreply.github.com> +Yui +Yusuf Kağan Hanoğlu +Yuval Peled <31162840+Yuval-Peled@users.noreply.github.com> +ZHAOKAI WANG +Zane Shannon +Zay <95888118+isaiahbjork@users.noreply.github.com> +Zenix +Zhang Peiyuan +ZhouYuChen +Ziad Ben Hadj-Alouane +Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> +Zsapi +a-n-n-a-l-e-e <150648636+a-n-n-a-l-e-e@users.noreply.github.com> +adel boussaken +afrideva <95653597+afrideva@users.noreply.github.com> +akawrykow <142945436+akawrykow@users.noreply.github.com> +alexpinel <93524949+alexpinel@users.noreply.github.com> +alonfaraj +andrijdavid +anon998 <131767832+anon998@users.noreply.github.com> +anzz1 +apaz +apcameron <37645737+apcameron@users.noreply.github.com> +arcrank +arlo-phoenix <140345165+arlo-phoenix@users.noreply.github.com> +at8u <129688334+at8u@users.noreply.github.com> +automaticcat +bandoti <141645996+bandoti@users.noreply.github.com> +beiller +bhubbb <79117352+bhubbb@users.noreply.github.com> +bmwl +bobqianic <129547291+bobqianic@users.noreply.github.com> +bryanSwk <93190252+bryanSwk@users.noreply.github.com> +bsilvereagle +bssrdf +byte-6174 <88070277+byte-6174@users.noreply.github.com> +cebtenzzre +chaihahaha +chiranko <96988916+chiranko@users.noreply.github.com> +clibdev <52199778+clibdev@users.noreply.github.com> +clyang +cocktailpeanut <121128867+cocktailpeanut@users.noreply.github.com> +coezbek +comex +compilade <113953597+compilade@users.noreply.github.com> +crasm +crasm +daboe01 +david raistrick +ddpasa <112642920+ddpasa@users.noreply.github.com> +deepdiffuser <112834445+deepdiffuser@users.noreply.github.com> +divinity76 +dotpy314 <33351922+dotpy314@users.noreply.github.com> +drbh +ds5t5 <145942675+ds5t5@users.noreply.github.com> +dylan +eastriver +ebraminio +eiery <19350831+eiery@users.noreply.github.com> +eric8607242 +fraxy-v <65565042+fraxy-v@users.noreply.github.com> +github-actions[bot] +gliptic +goerch +grahameth <96447521+grahameth@users.noreply.github.com> +gwjr <502526+gwjr@users.noreply.github.com> +h-h-h-h <13482553+h-h-h-h@users.noreply.github.com> +hankcs +hoangmit +hongbo.mo <352280764@qq.com> +howlger +howlger +hutli <6594598+hutli@users.noreply.github.com> +hutli +hutli +hxer7963 +hydai +iSma +iacore <74560659+iacore@users.noreply.github.com> +igarnier +iohub +jacobi petrucciani <8117202+jpetrucciani@users.noreply.github.com> +jameswu2014 <545426914@qq.com> +jneem +johnson442 <56517414+johnson442@users.noreply.github.com> +jon-chuang <9093549+jon-chuang@users.noreply.github.com> +jp-x-g +jwj7140 <32943891+jwj7140@users.noreply.github.com> +kaizau +kalomaze <66376113+kalomaze@users.noreply.github.com> +kang +katsu560 <118887472+katsu560@users.noreply.github.com> +kchro3 <62481661+kchro3@users.noreply.github.com> +khimaros +kiltyj +klosax <131523366+klosax@users.noreply.github.com> +kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> +kunnis +kuronekosaiko +kuvaus <22169537+kuvaus@users.noreply.github.com> +kwin1412 <42286931+kwin1412@users.noreply.github.com> +l3utterfly +ldwang +le.chang +leejet +limitedAtonement +lon <114724657+longregen@users.noreply.github.com> +m3ndax +maddes8cht <55592906+maddes8cht@users.noreply.github.com> +makomk +manikbhandari +mdrokz +mgroeber9110 <45620825+mgroeber9110@users.noreply.github.com> +minarchist +mj-shifu <77107165+mj-shifu@users.noreply.github.com> +mmyjona +momonga <115213907+mmnga@users.noreply.github.com> +moritzbrantner <31051084+moritzbrantner@users.noreply.github.com> +mzcu +nanahi <130121847+na-na-hi@users.noreply.github.com> +ngc92 <7938269+ngc92@users.noreply.github.com> +nhamanasu <45545786+nhamanasu@users.noreply.github.com> +niansa/tuxifan +niansa/tuxifan +ningshanwutuobang +nold +nopperl <54780682+nopperl@users.noreply.github.com> +nusu-github <29514220+nusu-github@users.noreply.github.com> +olexiyb +oobabooga <112222186+oobabooga@users.noreply.github.com> +opparco +ostix360 <55257054+ostix360@users.noreply.github.com> +perserk +postmasters +pudepiedj +qingfengfenga <41416092+qingfengfenga@users.noreply.github.com> +qouoq +qunash +rabidcopy +rankaiyx +rhjdvsgsgks <26178113+rhjdvsgsgks@users.noreply.github.com> +rhuddleston +rimoliga <53384203+rimoliga@users.noreply.github.com> +runfuture +sandyiscool +semidark +sharpHL <132747147+sharpHL@users.noreply.github.com> +shibe2 +singularity <12184989+singularity-s0@users.noreply.github.com> +sjinzh +slaren <2141330+slaren@users.noreply.github.com> +slaren +snadampal <87143774+snadampal@users.noreply.github.com> +staviq +stduhpf +swittk +takov751 <40316768+takov751@users.noreply.github.com> +tarcey +texmex76 <40733439+texmex76@users.noreply.github.com> +thement <40525767+thement@users.noreply.github.com> +tjohnman +tslmy +ubik2 +uint256_t +uint256_t +unbounded +valiray <133289098+valiray@users.noreply.github.com> +vodkaslime <646329483@qq.com> +vvhg1 <94630311+vvhg1@users.noreply.github.com> +vxiiduu <73044267+vxiiduu@users.noreply.github.com> +wbpxre150 <100937007+wbpxre150@users.noreply.github.com> +whoreson <139810751+whoreson@users.noreply.github.com> +wonjun Jang +wzy <32936898+Freed-Wu@users.noreply.github.com> +xaedes +xaedes +xloem <0xloem@gmail.com> +yangli2 +yuiseki +zakkor +zhouwg <6889919+zhouwg@users.noreply.github.com> +zrm +源文雨 <41315874+fumiama@users.noreply.github.com> +Нияз Гарифзянов <112617865+garrnizon@users.noreply.github.com> diff --git a/CMakeLists.txt b/CMakeLists.txt index fc4cff28f..aa65b0d6c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,8 @@ else() set(LLAMA_METAL_DEFAULT OFF) endif() +set(LLAMA_LLAMAFILE_DEFAULT ON) + # general option(BUILD_SHARED_LIBS "build shared libraries" OFF) option(LLAMA_STATIC "llama: static link libraries" OFF) @@ -88,9 +90,10 @@ endif() # 3rd party libs option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON) option(LLAMA_BLAS "llama: use BLAS" OFF) +option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM" ${LLAMA_LLAMAFILE_DEFAULT}) set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor") -option(LLAMA_CUBLAS "llama: use CUDA" OFF) -#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF) +option(LLAMA_CUDA "llama: use CUDA" OFF) +option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF) option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF) option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF) set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels") @@ -99,6 +102,9 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. batch size for using peer access") +option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF) +option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF) + option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) @@ -112,6 +118,9 @@ option(LLAMA_METAL "llama: use Metal" option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF) option(LLAMA_METAL_SHADER_DEBUG "llama: compile Metal with -fno-fast-math" OFF) option(LLAMA_METAL_EMBED_LIBRARY "llama: embed Metal library" OFF) +set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING + "llama: metal minimum macOS version") +set(LLAMA_METAL_STD "" CACHE STRING "llama: metal standard version (-std flag)") option(LLAMA_KOMPUTE "llama: use Kompute" OFF) option(LLAMA_MPI "llama: use MPI" OFF) option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF) @@ -249,6 +258,16 @@ if (LLAMA_METAL) set(XC_FLAGS -O3) endif() + # Append macOS metal versioning flags + if (LLAMA_METAL_MACOSX_VERSION_MIN) + message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation") + list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN}) + endif() + if (LLAMA_METAL_STD) + message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation") + list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD}) + endif() + add_custom_command( OUTPUT ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air @@ -272,6 +291,7 @@ if (LLAMA_METAL) ${METALKIT_FRAMEWORK} ) endif() + if (LLAMA_BLAS) if (LLAMA_STATIC) set(BLA_STATIC ON) @@ -354,29 +374,47 @@ if (LLAMA_BLAS) endif() endif() +if (LLAMA_LLAMAFILE) + add_compile_definitions(GGML_USE_LLAMAFILE) + + set(GGML_HEADERS_LLAMAFILE sgemm.h) + set(GGML_SOURCES_LLAMAFILE sgemm.cpp) +endif() + if (LLAMA_QKK_64) add_compile_definitions(GGML_QKK_64) endif() if (LLAMA_CUBLAS) + message(WARNING "LLAMA_CUBLAS is deprecated and will be removed in the future.\nUse LLAMA_CUDA instead") + set(LLAMA_CUDA ON) +endif() + +if (LLAMA_CUDA) cmake_minimum_required(VERSION 3.17) find_package(CUDAToolkit) if (CUDAToolkit_FOUND) - message(STATUS "cuBLAS found") + message(STATUS "CUDA found") enable_language(CUDA) set(GGML_HEADERS_CUDA ggml-cuda.h) - set(GGML_SOURCES_CUDA ggml-cuda.cu) - add_compile_definitions(GGML_USE_CUBLAS) + file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu") + + add_compile_definitions(GGML_USE_CUDA) + add_compile_definitions(GGML_CUDA_USE_GRAPHS) if (LLAMA_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV) endif() if (LLAMA_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() + if (LLAMA_CUDA_NO_VMM) + add_compile_definitions(GGML_CUDA_NO_VMM) + endif() add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) if (DEFINED LLAMA_CUDA_DMMV_Y) @@ -387,10 +425,13 @@ if (LLAMA_CUBLAS) endif() add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE}) + if (LLAMA_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() if (LLAMA_STATIC) if (WIN32) - # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library + # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt) else () set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static) @@ -399,7 +440,11 @@ if (LLAMA_CUBLAS) set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt) endif() - set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) + if (LLAMA_CUDA_NO_VMM) + # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) + else() + set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ... + endif() if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) # 52 == lowest CUDA 12 standard @@ -416,7 +461,7 @@ if (LLAMA_CUBLAS) message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") else() - message(WARNING "cuBLAS not found") + message(WARNING "CUDA not found") endif() endif() @@ -515,9 +560,11 @@ if (LLAMA_HIPBLAS) message(STATUS "HIP and hipBLAS found") set(GGML_HEADERS_ROCM ggml-cuda.h) - set(GGML_SOURCES_ROCM ggml-cuda.cu) - add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS) + file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu") + list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu") + + add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA) if (LLAMA_HIP_UMA) add_compile_definitions(GGML_HIP_UMA) @@ -531,11 +578,15 @@ if (LLAMA_HIPBLAS) add_compile_definitions(GGML_CUDA_FORCE_MMQ) endif() + if (LLAMA_CUDA_NO_PEER_COPY) + add_compile_definitions(GGML_CUDA_NO_PEER_COPY) + endif() + add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y}) add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER}) - set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX) + set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) if (LLAMA_STATIC) message(FATAL_ERROR "Static linking not supported for HIP/ROCm") @@ -818,7 +869,7 @@ endif() set(CUDA_CXX_FLAGS "") -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) set(CUDA_FLAGS -use_fast_math) if (LLAMA_FATAL_WARNINGS) @@ -1043,7 +1094,7 @@ endif() add_compile_options("$<$:${ARCH_FLAGS}>") add_compile_options("$<$:${ARCH_FLAGS}>") -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS}) list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "") @@ -1121,15 +1172,16 @@ add_library(ggml OBJECT ggml-backend.h ggml-quants.c ggml-quants.h - ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} - ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} - ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} - ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} - ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} - ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} - ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} - ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} - ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} + ${GGML_SOURCES_OPENCL} ${GGML_HEADERS_OPENCL} + ${GGML_SOURCES_METAL} ${GGML_HEADERS_METAL} + ${GGML_SOURCES_MPI} ${GGML_HEADERS_MPI} + ${GGML_SOURCES_EXTRA} ${GGML_HEADERS_EXTRA} + ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL} + ${GGML_SOURCES_KOMPUTE} ${GGML_HEADERS_KOMPUTE} + ${GGML_SOURCES_VULKAN} ${GGML_HEADERS_VULKAN} + ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} + ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ) target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES}) @@ -1153,6 +1205,7 @@ add_library(llama llama.h unicode.h unicode.cpp + unicode-data.cpp ) target_include_directories(llama PUBLIC .) @@ -1248,6 +1301,12 @@ if (LLAMA_METAL) GROUP_READ WORLD_READ DESTINATION ${CMAKE_INSTALL_BINDIR}) + if (NOT LLAMA_METAL_EMBED_LIBRARY) + install( + FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib + DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() endif() # diff --git a/LICENSE b/LICENSE index 76f67efdc..acb96ce78 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2023 Georgi Gerganov +Copyright (c) 2023-2024 The ggml authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/Makefile b/Makefile index 9b72e1dbd..3fa56d13a 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,28 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ - simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ - speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o + simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \ + retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o # Binaries only useful for tests TEST_TARGETS = \ - tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ - tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ - tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-rope \ - tests/test-backend-ops tests/test-model-load-cancel tests/test-autorelease \ - tests/test-json-schema-to-grammar + tests/test-autorelease \ + tests/test-backend-ops \ + tests/test-double-float \ + tests/test-grad0 \ + tests/test-grammar-integration \ + tests/test-grammar-parser \ + tests/test-json-schema-to-grammar \ + tests/test-llama-grammar \ + tests/test-model-load-cancel \ + tests/test-opt \ + tests/test-quantize-fns \ + tests/test-quantize-perf \ + tests/test-rope \ + tests/test-sampling \ + tests/test-tokenizer-0 \ + tests/test-tokenizer-1-bpe \ + tests/test-tokenizer-1-spm # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -27,6 +39,17 @@ ifndef UNAME_M UNAME_M := $(shell uname -m) endif +# In GNU make default CXX is g++ instead of c++. Let's fix that so that users +# of non-gcc compilers don't have to provide g++ alias or wrapper. +DEFCC := cc +DEFCXX := c++ +ifeq ($(origin CC),default) +CC := $(DEFCC) +endif +ifeq ($(origin CXX),default) +CXX := $(DEFCXX) +endif + # Mac OS + Arm can report x86_64 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789 ifeq ($(UNAME_S),Darwin) @@ -49,11 +72,16 @@ default: $(BUILD_TARGETS) test: $(TEST_TARGETS) @failures=0; \ for test_target in $(TEST_TARGETS); do \ - if [ "$$test_target" = "tests/test-tokenizer-0-llama" ]; then \ - ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ - elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ + if [ "$$test_target" = "tests/test-tokenizer-0" ]; then \ + ./$$test_target $(CURDIR)/models/ggml-vocab-llama-spm.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ - elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ + ./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \ + ./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \ + elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \ continue; \ elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ continue; \ @@ -384,20 +412,31 @@ ifdef LLAMA_OPENBLAS MK_LDFLAGS += $(shell pkg-config --libs openblas) endif # LLAMA_OPENBLAS +ifndef LLAMA_NO_LLAMAFILE + MK_CPPFLAGS += -DGGML_USE_LLAMAFILE + OBJS += sgemm.o +endif + ifdef LLAMA_BLIS MK_CPPFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/blis -I/usr/include/blis MK_LDFLAGS += -lblis -L/usr/local/lib endif # LLAMA_BLIS ifdef LLAMA_CUBLAS +# LLAMA_CUBLAS is deprecated and will be removed in the future + LLAMA_CUDA := 1 +endif + +ifdef LLAMA_CUDA ifneq ('', '$(wildcard /opt/cuda)') CUDA_PATH ?= /opt/cuda else CUDA_PATH ?= /usr/local/cuda endif - MK_CPPFLAGS += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include + MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib OBJS += ggml-cuda.o + OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) MK_NVCCFLAGS += -use_fast_math ifdef LLAMA_FATAL_WARNINGS MK_NVCCFLAGS += -Werror all-warnings @@ -452,22 +491,31 @@ ifdef LLAMA_CUDA_PEER_MAX_BATCH_SIZE else MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE -#ifdef LLAMA_CUDA_CUBLAS -# MK_NVCCFLAGS += -DGGML_CUDA_CUBLAS -#endif # LLAMA_CUDA_CUBLAS +ifdef LLAMA_CUDA_NO_PEER_COPY + MK_NVCCFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # LLAMA_CUDA_NO_PEER_COPY ifdef LLAMA_CUDA_CCBIN MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN) endif -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h + ifdef JETSON_EOL_MODULE_DETECT - $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +define NVCC_COMPILE + $(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +endef # NVCC_COMPILE else +define NVCC_COMPILE $(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@ +endef # NVCC_COMPILE endif # JETSON_EOL_MODULE_DETECT -endif # LLAMA_CUBLAS + +ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh + $(NVCC_COMPILE) + +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) + $(NVCC_COMPILE) +endif # LLAMA_CUDA ifdef LLAMA_CLBLAST - MK_CPPFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags-only-I clblast OpenCL) MK_CFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL) MK_CXXFLAGS += $(shell pkg-config --cflags-only-other clblast OpenCL) @@ -510,7 +558,6 @@ ggml-vulkan.o: ggml-vulkan.cpp ggml-vulkan.h endif # LLAMA_VULKAN ifdef LLAMA_HIPBLAS - ifeq ($(wildcard /opt/rocm),) ROCM_PATH ?= /usr GPU_TARGETS ?= $(shell $(shell which amdgpu-arch)) @@ -522,7 +569,7 @@ ifdef LLAMA_HIPBLAS LLAMA_CUDA_DMMV_X ?= 32 LLAMA_CUDA_MMV_Y ?= 1 LLAMA_CUDA_KQUANTS_ITER ?= 2 - MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS + MK_CPPFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUDA ifdef LLAMA_HIP_UMA MK_CPPFLAGS += -DGGML_HIP_UMA endif # LLAMA_HIP_UMA @@ -535,9 +582,18 @@ endif # LLAMA_HIP_UMA ifdef LLAMA_CUDA_FORCE_DMMV HIPFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV +ifdef LLAMA_CUDA_NO_PEER_COPY + HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY +endif # LLAMA_CUDA_NO_PEER_COPY OBJS += ggml-cuda.o -ggml-cuda.o: ggml-cuda.cu ggml-cuda.h + OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu)) + +ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh) $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< + +ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/common.cuh + $(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $< + endif # LLAMA_HIPBLAS ifdef LLAMA_METAL @@ -578,6 +634,11 @@ ggml-mpi.o: ggml-mpi.c ggml-mpi.h $(CC) $(CFLAGS) -c $< -o $@ endif # LLAMA_MPI +ifndef LLAMA_NO_LLAMAFILE +sgemm.o: sgemm.cpp sgemm.h ggml.h + $(CXX) $(CXXFLAGS) -c $< -o $@ +endif + GF_CC := $(CC) include scripts/get-flags.mk @@ -590,7 +651,7 @@ override NVCCFLAGS := $(MK_NVCCFLAGS) $(NVCCFLAGS) override LDFLAGS := $(MK_LDFLAGS) $(LDFLAGS) # identify CUDA host compiler -ifdef LLAMA_CUBLAS +ifdef LLAMA_CUDA GF_CC := $(NVCC) $(NVCCFLAGS) 2>/dev/null .c -Xcompiler include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic @@ -615,19 +676,26 @@ $(info I NVCCFLAGS: $(NVCCFLAGS)) $(info I LDFLAGS: $(LDFLAGS)) $(info I CC: $(shell $(CC) --version | head -n 1)) $(info I CXX: $(shell $(CXX) --version | head -n 1)) -ifdef LLAMA_CUBLAS +ifdef LLAMA_CUDA $(info I NVCC: $(shell $(NVCC) --version | tail -n 1)) CUDA_VERSION := $(shell $(NVCC) --version | grep -oP 'release (\K[0-9]+\.[0-9])') ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1) ifndef CUDA_DOCKER_ARCH ifndef CUDA_POWER_ARCH -$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH) +$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via environment variable CUDA_DOCKER_ARCH, e.g. by running "export CUDA_DOCKER_ARCH=compute_XX" on Unix-like systems, where XX is the minimum compute capability that the code needs to run on. A list with compute capabilities can be found here: https://developer.nvidia.com/cuda-gpus ) endif # CUDA_POWER_ARCH endif # CUDA_DOCKER_ARCH endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1) -endif # LLAMA_CUBLAS +endif # LLAMA_CUDA $(info ) +ifdef LLAMA_CUBLAS +$(info !!!!) +$(info LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.) +$(info !!!!) +$(info ) +endif + # # Build library # @@ -647,13 +715,16 @@ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h unicode.o: unicode.cpp unicode.h $(CXX) $(CXXFLAGS) -c $< -o $@ -OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o +unicode-data.o: unicode-data.cpp unicode-data.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + +OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h $(CXX) $(CXXFLAGS) -c $< -o $@ -COMMON_H_DEPS = common/common.h common/sampling.h common/log.h -COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o +COMMON_H_DEPS = common/common.h common/sampling.h common/log.h llama.h +COMMON_DEPS = common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o common.o: common/common.cpp $(COMMON_H_DEPS) $(CXX) $(CXXFLAGS) -c $< -o $@ @@ -673,6 +744,9 @@ json-schema-to-grammar.o: common/json-schema-to-grammar.cpp common/json-schema-t train.o: common/train.cpp common/train.h $(CXX) $(CXXFLAGS) -c $< -o $@ +ngram-cache.o: common/ngram-cache.cpp common/ngram-cache.h + $(CXX) $(CXXFLAGS) -c $< -o $@ + libllama.so: llama.o ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) @@ -680,7 +754,8 @@ libllama.a: llama.o ggml.o $(OBJS) $(COMMON_DEPS) ar rcs libllama.a llama.o ggml.o $(OBJS) $(COMMON_DEPS) clean: - rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot $(COV_TARGETS) $(BUILD_TARGETS) $(TEST_TARGETS) + rm -vrf ggml-cuda/*.o find examples pocs -type f -name "*.o" -delete # @@ -717,11 +792,11 @@ batched: examples/batched/batched.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o common.o $(OBJS) +batched-bench: examples/batched-bench/batched-bench.cpp build-info.o ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -quantize: examples/quantize/quantize.cpp build-info.o ggml.o llama.o $(OBJS) +quantize: examples/quantize/quantize.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -749,10 +824,19 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp json-schema-to-grammar.o common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) +server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h common/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/server/json-schema-to-grammar.mjs.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2) +# Portable equivalent of `cd examples/server/public && xxd -i $(notdir $<) ../$(notdir $<).hpp`: +examples/server/%.hpp: examples/server/public/% Makefile + @( export NAME=$(subst .,_,$(subst -,_,$(notdir $<))) && \ + echo "unsigned char $${NAME}[] = {" && \ + cat $< | od -v -t x1 -An | sed -E 's/([0-9a-fA-F]+)/0x\1, /g' && \ + echo "};" && \ + echo "unsigned int $${NAME}_len = $(shell cat $< | wc -c );" \ + ) > $@ + gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -761,6 +845,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -798,6 +886,10 @@ export-lora: examples/export-lora/export-lora.cpp ggml.o common/common.h $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +retrieval: examples/retrieval/retrieval.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + speculative: examples/speculative/speculative.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -810,11 +902,21 @@ lookahead: examples/lookahead/lookahead.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -lookup: examples/lookup/lookup.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +lookup: examples/lookup/lookup.cpp ggml.o llama.o ngram-cache.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-create.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-create.cpp) -o lookup-create $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-merge.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-merge.cpp) -o lookup-merge $(LDFLAGS) + $(CXX) $(CXXFLAGS) -c examples/lookup/lookup-stats.cpp -o $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, examples/lookup/lookup-stats.cpp) -o lookup-stats $(LDFLAGS) + +passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -passkey: examples/passkey/passkey.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) +gbnf-validator: examples/gbnf-validator/gbnf-validator.cpp ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -865,6 +967,10 @@ tests/test-grammar-parser: tests/test-grammar-parser.cpp ggml.o llama.o grammar- $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +tests/test-grammar-integration: tests/test-grammar-integration.cpp ggml.o llama.o grammar-parser.o $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + tests/test-double-float: tests/test-double-float.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -893,11 +999,7 @@ tests/test-sampling: tests/test-sampling.cpp ggml.o llama.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) - $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) - $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) - -tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-0: tests/test-tokenizer-0.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) @@ -905,7 +1007,7 @@ tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMM $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) +tests/test-tokenizer-1-spm: tests/test-tokenizer-1-spm.cpp ggml.o llama.o $(COMMON_DEPS) console.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 4696ae5e4..183e64757 100644 --- a/Package.swift +++ b/Package.swift @@ -2,6 +2,45 @@ import PackageDescription +var sources = [ + "ggml.c", + "sgemm.cpp", + "llama.cpp", + "unicode.cpp", + "unicode-data.cpp", + "ggml-alloc.c", + "ggml-backend.c", + "ggml-quants.c", +] + +var resources: [Resource] = [] +var linkerSettings: [LinkerSetting] = [] +var cSettings: [CSetting] = [ + .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), + .unsafeFlags(["-fno-objc-arc"]), + // NOTE: NEW_LAPACK will required iOS version 16.4+ + // We should consider add this in the future when we drop support for iOS 14 + // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) + // .define("ACCELERATE_NEW_LAPACK"), + // .define("ACCELERATE_LAPACK_ILP64") +] + +#if canImport(Darwin) +sources.append("ggml-metal.m") +resources.append(.process("ggml-metal.metal")) +linkerSettings.append(.linkedFramework("Accelerate")) +cSettings.append( + contentsOf: [ + .define("GGML_USE_ACCELERATE"), + .define("GGML_USE_METAL") + ] +) +#endif + +#if os(Linux) + cSettings.append(.define("_GNU_SOURCE")) +#endif + let package = Package( name: "llama", platforms: [ @@ -28,33 +67,11 @@ let package = Package( "ggml-cuda.h", "Makefile" ], - sources: [ - "ggml.c", - "llama.cpp", - "unicode.cpp", - "ggml-alloc.c", - "ggml-backend.c", - "ggml-quants.c", - "ggml-metal.m", - ], - resources: [ - .process("ggml-metal.metal") - ], + sources: sources, + resources: resources, publicHeadersPath: "spm-headers", - cSettings: [ - .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]), - .define("GGML_USE_ACCELERATE"), - .unsafeFlags(["-fno-objc-arc"]), - .define("GGML_USE_METAL"), - // NOTE: NEW_LAPACK will required iOS version 16.4+ - // We should consider add this in the future when we drop support for iOS 14 - // (ref: ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc) - // .define("ACCELERATE_NEW_LAPACK"), - // .define("ACCELERATE_LAPACK_ILP64") - ], - linkerSettings: [ - .linkedFramework("Accelerate") - ] + cSettings: cSettings, + linkerSettings: linkerSettings ) ], cxxLanguageStandard: .cxx11 diff --git a/README-sycl.md b/README-sycl.md index 501b9d481..cfa248a95 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -3,32 +3,42 @@ - [Background](#background) - [News](#news) - [OS](#os) -- [Intel GPU](#intel-gpu) +- [Hardware](#hardware) - [Docker](#docker) - [Linux](#linux) - [Windows](#windows) - [Environment Variable](#environment-variable) -- [Known Issue](#known-issue) -- [Q&A](#q&a) -- [Todo](#todo) +- [Known Issue](#known-issues) +- [Q&A](#qa) +- [TODO](#todo) ## Background -SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17. +**SYCL** is a high-level parallel programming model designed to improve developers productivity writing code across various hardware accelerators such as CPUs, GPUs, and FPGAs. It is a single-source language designed for heterogeneous computing and based on standard C++17. -oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms. +**oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include: -Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs. +- **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers. +- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL - Math Kernel Library)*. +- **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. +- **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. -To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL. +### Llama.cpp + SYCL -The llama.cpp for SYCL is used to support Intel GPUs. +The llama.cpp SYCL backend is designed to support **Intel GPU** firstly. Based on the cross-platform feature of SYCL, it could support other vendor GPUs: Nvidia GPU (*AMD GPU coming*). -For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). +When targeting **Intel CPU**, it is recommended to use llama.cpp for [Intel oneMKL](README.md#intel-onemkl) backend. + +It has the similar design of other llama.cpp BLAS-based paths such as *OpenBLAS, cuBLAS, CLBlast etc..*. In beginning work, the oneAPI's [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) open-source migration tool (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) was used for this purpose. ## News +- 2024.4 + - Support data types: GGML_TYPE_IQ4_NL, GGML_TYPE_IQ4_XS, GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M. + - 2024.3 + - Release binary files of Windows. + - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd). - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. @@ -44,216 +54,230 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). ## OS -|OS|Status|Verified| -|-|-|-| -|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39| -|Windows|Support|Windows 11| +| OS | Status | Verified | +|---------|---------|------------------------------------| +| Linux | Support | Ubuntu 22.04, Fedora Silverblue 39 | +| Windows | Support | Windows 11 | -## Intel GPU +## Hardware -### Verified +### Intel GPU -|Intel GPU| Status | Verified Model| -|-|-|-| -|Intel Data Center Max Series| Support| Max 1550| -|Intel Data Center Flex Series| Support| Flex 170| -|Intel Arc Series| Support| Arc 770, 730M| -|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake| -|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7| +**Verified devices** -Note: If the EUs (Execution Unit) in iGPU is less than 80, the inference speed will be too slow to use. +| Intel GPU | Status | Verified Model | +|-------------------------------|---------|---------------------------------------| +| Intel Data Center Max Series | Support | Max 1550, 1100 | +| Intel Data Center Flex Series | Support | Flex 170 | +| Intel Arc Series | Support | Arc 770, 730M | +| Intel built-in Arc GPU | Support | built-in Arc GPU in Meteor Lake | +| Intel iGPU | Support | iGPU in i5-1250P, i7-1260P, i7-1165G7 | -### Memory +*Notes:* -The memory is a limitation to run LLM on GPUs. +- **Memory** + - The device memory is a limitation when running a large model. The loaded model size, *`llm_load_tensors: buffer_size`*, is displayed in the log when running `./bin/main`. -When run llama.cpp, there is print log to show the applied memory on GPU. You could know how much memory to be used in your case. Like `llm_load_tensors: buffer size = 3577.56 MiB`. + - Please make sure the GPU shared memory from the host is large enough to account for the model's size. For e.g. the *llama-2-7b.Q4_0* requires at least 8.0GB for integrated GPU and 4.0GB for discrete GPU. -For iGPU, please make sure the shared memory from host memory is enough. For llama-2-7b.Q4_0, recommend the host memory is 8GB+. +- **Execution Unit (EU)** + - If the iGPU has less than 80 EUs, the inference speed will likely be too slow for practical use. -For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+. +### Other Vendor GPU -## Nvidia GPU +**Verified devices** -### Verified - -|Intel GPU| Status | Verified Model| -|-|-|-| -|Ampere Series| Support| A100| - -### oneMKL for CUDA - -The current oneMKL release does not contain the oneMKL cuBlas backend. -As a result for Nvidia GPU's oneMKL must be built from source. - -``` -git clone https://github.com/oneapi-src/oneMKL -cd oneMKL -mkdir build -cd build -cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -ninja -// Add paths as necessary -``` +| Nvidia GPU | Status | Verified Model | +|--------------------------|---------|----------------| +| Ampere Series | Support | A100, A4000 | +| Ampere Series *(Mobile)* | Support | RTX 40 Series | ## Docker +The docker build option is currently limited to *intel GPU* targets. -Note: -- Only docker on Linux is tested. Docker on WSL may not work. -- You may need to install Intel GPU driver on the host machine (See the [Linux](#linux) section to know how to do that) - -### Build the image - -You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. - - +### Build image ```sh -# For F16: -#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . - -# Or, for F32: -docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile . - -# Note: you can also use the ".devops/server-intel.Dockerfile", which compiles the "server" example +# Using FP16 +docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile . ``` -### Run +*Notes*: + +To build in default FP32 *(Slower than FP16 alternative)*, you can remove the `--build-arg="LLAMA_SYCL_F16=ON"` argument from the previous command. + +You can also use the `.devops/server-intel.Dockerfile`, which builds the *"server"* alternative. + +### Run container ```sh -# Firstly, find all the DRI cards: +# First, find all the DRI cards ls -la /dev/dri -# Then, pick the card that you want to use. - -# For example with "/dev/dri/card1" +# Then, pick the card that you want to use (here for e.g. /dev/dri/card1). docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 ``` +*Notes:* +- Docker has been tested successfully on native Linux. WSL support has not been verified yet. +- You may need to install Intel GPU driver on the **host** machine *(Please refer to the [Linux configuration](#linux) for details)*. + ## Linux -### Setup Environment +### I. Setup Environment -1. Install Intel GPU driver. +1. **Install GPU drivers** -a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html). + - **Intel GPU** -Note: for iGPU, please install the client GPU driver. +Intel data center GPUs drivers installation guide and download page can be found here: [Get intel dGPU Drivers](https://dgpu-docs.intel.com/driver/installation.html#ubuntu-install-steps). -b. Add user to group: video, render. +*Note*: for client GPUs *(iGPU & Arc A-Series)*, please refer to the [client iGPU driver installation](https://dgpu-docs.intel.com/driver/client/overview.html). + +Once installed, add the user(s) to the `video` and `render` groups. ```sh -sudo usermod -aG render username -sudo usermod -aG video username +sudo usermod -aG render $USER +sudo usermod -aG video $USER ``` -Note: re-login to enable it. +*Note*: logout/re-login for the changes to take effect. -c. Check +Verify installation through `clinfo`: ```sh sudo apt install clinfo sudo clinfo -l ``` -Output (example): +Sample output: -``` +```sh Platform #0: Intel(R) OpenCL Graphics `-- Device #0: Intel(R) Arc(TM) A770 Graphics - Platform #0: Intel(R) OpenCL HD Graphics `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49] ``` -2. Install Intel® oneAPI Base toolkit. +- **Nvidia GPU** -a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). +In order to target Nvidia GPUs through SYCL, please make sure the CUDA/CUBLAS native requirements *-found [here](README.md#cuda)-* are installed. -Recommend to install to default folder: **/opt/intel/oneapi**. +2. **Install Intel® oneAPI Base toolkit** -Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder. +- **For Intel GPU** -b. Check +The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page. + +Please follow the instructions for downloading and installing the Toolkit for Linux, and preferably keep the default installation values unchanged, notably the installation path *(`/opt/intel/oneapi` by default)*. + +Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. + +Upon a successful installation, SYCL is enabled for the available intel devices, along with relevant libraries such as oneAPI MKL for intel GPUs. + +- **Adding support to Nvidia GPUs** + +**oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. + + +**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs. ```sh -source /opt/intel/oneapi/setvars.sh +git clone https://github.com/oneapi-src/oneMKL +cd oneMKL +cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas +cmake --build buildWithCublas --config Release +``` + +3. **Verify installation and environment** + +In order to check the available SYCL devices on the machine, please use the `sycl-ls` command. +```sh +source /opt/intel/oneapi/setvars.sh sycl-ls ``` -There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. +- **Intel GPU** + +When targeting an intel GPU, the user should expect one or more level-zero devices among the available SYCL devices. Please make sure that at least one GPU is present, for instance [`ext_oneapi_level_zero:gpu:0`] in the sample output below: -Output (example): ``` [opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000] [opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000] [opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50] [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918] - ``` -2. Build locally: +- **Nvidia GPU** -Note: -- You can choose between **F16** and **F32** build. F16 is faster for long-prompt inference. -- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. +Similarly, user targeting Nvidia GPUs should expect at least one SYCL-CUDA device [`ext_oneapi_cuda:gpu`] as bellow: +``` +[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.12.0.12_195853.xmain-hotfix] +[opencl:cpu:1] Intel(R) OpenCL, Intel(R) Xeon(R) Gold 6326 CPU @ 2.90GHz OpenCL 3.0 (Build 0) [2023.16.12.0.12_195853.xmain-hotfix] +[ext_oneapi_cuda:gpu:0] NVIDIA CUDA BACKEND, NVIDIA A100-PCIE-40GB 8.0 [CUDA 12.2] +``` +### II. Build llama.cpp + +#### Intel GPU ```sh -mkdir -p build -cd build +# Export relevant ENV variables source /opt/intel/oneapi/setvars.sh -# For FP16: -#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON +# Build LLAMA with MKL BLAS acceleration for intel GPU -# Or, for FP32: -cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Option 1: Use FP32 (recommended for better performance in most cases) +cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -# For Nvidia GPUs -cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +# Option 2: Use FP16 +cmake -B build -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON -# Build example/main only -#cmake --build . --config Release --target main - -# Or, build all binary -cmake --build . --config Release -v - -cd .. +# build all binary +cmake --build build --config Release -j -v ``` -or - +#### Nvidia GPU ```sh -./examples/sycl/build.sh +# Export relevant ENV variables +export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH +export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR +export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR + +# Build LLAMA with Nvidia BLAS acceleration through SYCL + +# Option 1: Use FP32 (recommended for better performance in most cases) +cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx + +# Option 2: Use FP16 +cmake -B build -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON + +# build all binary +cmake --build build --config Release -j -v + ``` -### Run +### III. Run the inference -1. Put model file to folder **models** +1. Retrieve and prepare model -You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. +You can refer to the general [*Prepare and Quantize*](README.md#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. 2. Enable oneAPI running environment -``` +```sh source /opt/intel/oneapi/setvars.sh ``` -3. List device ID +3. List devices information -Run without parameter: +Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ```sh ./build/bin/ls-sycl-device - -# or running the "main" executable and look at the output log: - -./build/bin/main ``` - -Check the ID in startup log, like: - +A example of such log in a system with 1 *intel CPU* and 1 *intel GPU* can look like the following: ``` found 6 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | @@ -267,22 +291,22 @@ found 6 SYCL devices: | 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero running time, recommended | -|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| +| Attribute | Note | +|------------------------|-------------------------------------------------------------| +| compute capability 1.3 | Level-zero driver/runtime, recommended | +| compute capability 3.0 | OpenCL driver/runtime, slower than level-zero in most cases | -4. Device selection and execution of llama.cpp +4. Launch inference There are two device selection modes: -- Single device: Use one device assigned by user. -- Multiple devices: Automatically choose the devices with the same biggest Max compute units. +- Single device: Use one device target specified by the user. +- Multiple devices: Automatically select the devices with the same largest Max compute-units. -|Device selection|Parameter| -|-|-| -|Single device|--split-mode none --main-gpu DEVICE_ID | -|Multiple devices|--split-mode layer (default)| +| Device selection | Parameter | +|------------------|----------------------------------------| +| Single device | --split-mode none --main-gpu DEVICE_ID | +| Multiple devices | --split-mode layer (default) | Examples: @@ -302,74 +326,63 @@ or run by script: ```sh ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` -or run by script: + +Otherwise, you can run the script: ```sh ./examples/sycl/run_llama2.sh ``` -Note: +*Notes:* -- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. +- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: - -5. Verify the device ID in output - -Verify to see if the selected GPU is shown in the output, like: - -``` +```sh detect 1 SYCL GPUs: [0] with top Max compute units:512 ``` Or -``` +```sh use 1 SYCL GPUs: [0] with Max compute units:512 ``` - ## Windows -### Setup Environment +### I. Setup Environment -1. Install Intel GPU driver. +1. Install GPU driver -Please install Intel GPU driver by official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). +Intel GPU drivers instructions guide and download page can be found here: [Get intel GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html). -Note: **The driver is mandatory for compute function**. +2. Install Visual Studio -2. Install Visual Studio. +If you already have a recent version of Microsoft Visual Studio, you can skip this step. Otherwise, please refer to the official download page for [Microsoft Visual Studio](https://visualstudio.microsoft.com/). -Please install [Visual Studio](https://visualstudio.microsoft.com/) which impact oneAPI environment enabling in Windows. +3. Install Intel® oneAPI Base toolkit -3. Install Intel® oneAPI Base toolkit. +The base toolkit can be obtained from the official [Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) page. -a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html). +Please follow the instructions for downloading and installing the Toolkit for Windows, and preferably keep the default installation values unchanged, notably the installation path *(`C:\Program Files (x86)\Intel\oneAPI` by default)*. -Recommend to install to default folder: **C:\Program Files (x86)\Intel\oneAPI**. - -Following guide uses the default folder as example. If you use other folder, please modify the following guide info with your folder. +Following guidelines/code snippets assume the default installation values. Otherwise, please make sure the necessary changes are reflected where applicable. b. Enable oneAPI running environment: -- In Search, input 'oneAPI'. +- Type "oneAPI" in the search bar, then open the `Intel oneAPI command prompt for Intel 64 for Visual Studio 2022` App. -Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022" - -- In Run: - -In CMD: +- On the command prompt, enable the runtime environment with the following: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` -c. Check GPU +c. Verify installation -In oneAPI command line: +In the oneAPI command line, run the following to print the available SYCL devices: ``` sycl-ls ``` -There should be one or more level-zero devices. Please confirm that at least one GPU is present, like **[ext_oneapi_level_zero:gpu:0]**. +There should be one or more *level-zero* GPU devices displayed as **[ext_oneapi_level_zero:gpu]**. Below is example of such output detecting an *intel Iris Xe* GPU as a Level-zero SYCL device: Output (example): ``` @@ -379,7 +392,7 @@ Output (example): [ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044] ``` -4. Install cmake & make +4. Install build tools a. Download & install cmake for Windows: https://cmake.org/download/ @@ -389,76 +402,55 @@ b. Download & install mingw-w64 make for Windows provided by w64devkit - Extract `w64devkit` on your pc. -- Add the **bin** folder path in the Windows system PATH environment, like `C:\xxx\w64devkit\bin\`. +- Add the **bin** folder path in the Windows system PATH environment (for e.g. `C:\xxx\w64devkit\bin\`). -### Build locally: +### II. Build llama.cpp -In oneAPI command line window: +On the oneAPI command line window, step into the llama.cpp main directory and run the following: ``` -mkdir -p build -cd build @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -:: for FP16 -:: faster for long-prompt inference -:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON +# Option 1: Use FP32 (recommended for better performance in most cases) +cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -:: for FP32 -cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release +# Option 2: Or FP16 +cmake -B build -G "MinGW Makefiles" -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON - -:: build example/main only -:: make main - -:: build all binary -make -j -cd .. +cmake --build build --config Release -j ``` -or - -``` +Otherwise, run the `win-build-sycl.bat` wrapper which encapsulates the former instructions: +```sh .\examples\sycl\win-build-sycl.bat ``` -Note: +*Notes:* -- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only. +- By default, calling `make` will build all target binary files. In case of a minimal experimental setup, the user can build the inference executable only through `make main`. -### Run +### III. Run the inference -1. Put model file to folder **models** +1. Retrieve and prepare model -You could download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) as example. +You can refer to the general [*Prepare and Quantize*](README#prepare-and-quantize) guide for model prepration, or simply download [llama-2-7b.Q4_0.gguf](https://huggingface.co/TheBloke/Llama-2-7B-GGUF/blob/main/llama-2-7b.Q4_0.gguf) model as example. 2. Enable oneAPI running environment -- In Search, input 'oneAPI'. - -Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022" - -- In Run: - -In CMD: +On the oneAPI command line window, run the following and step into the llama.cpp directory: ``` "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 ``` -3. List device ID +3. List devices information -Run without parameter: +Similar to the native `sycl-ls`, available SYCL devices can be queried as follow: ``` build\bin\ls-sycl-device.exe - -or - -build\bin\main.exe ``` -Check the ID in startup log, like: - +The output of this command in a system with 1 *intel CPU* and 1 *intel GPU* would look like the following: ``` found 6 SYCL devices: | | | |Compute |Max compute|Max work|Max sub| | @@ -473,23 +465,23 @@ found 6 SYCL devices: ``` -|Attribute|Note| -|-|-| -|compute capability 1.3|Level-zero running time, recommended | -|compute capability 3.0|OpenCL running time, slower than level-zero in most cases| +| Attribute | Note | +|------------------------|-----------------------------------------------------------| +| compute capability 1.3 | Level-zero running time, recommended | +| compute capability 3.0 | OpenCL running time, slower than level-zero in most cases | -4. Device selection and execution of llama.cpp +4. Launch inference There are two device selection modes: - Single device: Use one device assigned by user. - Multiple devices: Automatically choose the devices with the same biggest Max compute units. -|Device selection|Parameter| -|-|-| -|Single device|--split-mode none --main-gpu DEVICE_ID | -|Multiple devices|--split-mode layer (default)| +| Device selection | Parameter | +|------------------|----------------------------------------| +| Single device | --split-mode none --main-gpu DEVICE_ID | +| Multiple devices | --split-mode layer (default) | Examples: @@ -504,7 +496,7 @@ build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be ``` build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` -or run by script: +Otherwise, run the following wrapper script: ``` .\examples\sycl\win-run-llama2.bat @@ -512,19 +504,13 @@ or run by script: Note: -- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. +- Upon execution, verify the selected device(s) ID(s) in the output log, which can for instance be displayed as follow: - - -5. Verify the device ID in output - -Verify to see if the selected GPU is shown in the output, like: - -``` +```sh detect 1 SYCL GPUs: [0] with top Max compute units:512 ``` Or -``` +```sh use 1 SYCL GPUs: [0] with Max compute units:512 ``` @@ -532,67 +518,51 @@ use 1 SYCL GPUs: [0] with Max compute units:512 #### Build -|Name|Value|Function| -|-|-|-| -|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path.
For FP32/FP16, LLAMA_SYCL=ON is mandatory.| -|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference.
For FP32, not set it.| -|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path| -|CMAKE_CXX_COMPILER|icpx (Linux), icx (Windows)|use icpx/icx for SYCL code path| +| Name | Value | Function | +|--------------------|-----------------------------------|---------------------------------------------| +| LLAMA_SYCL | ON (mandatory) | Enable build with SYCL code path. | +| LLAMA_SYCL_TARGET | INTEL *(default)* \| NVIDIA | Set the SYCL target device type. | +| LLAMA_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. | +| CMAKE_C_COMPILER | icx | Set *icx* compiler for SYCL code path. | +| CMAKE_CXX_COMPILER | icpx *(Linux)*, icx *(Windows)* | Set `icpx/icx` compiler for SYCL code path. | -#### Running +#### Runtime +| Name | Value | Function | +|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------| +| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG | +| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer | -|Name|Value|Function| -|-|-|-| -|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| -|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer| +## Known Issues -## Known Issue - -- Hang during startup - - llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block. - - Solution: add **--no-mmap** or **--mmap 0**. - -- Split-mode: [row] is not supported - - It's on developing. +- `Split-mode:[row]` is not supported. ## Q&A -Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible. - - - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. - Miss to enable oneAPI running environment. + - Potential cause: Unavailable oneAPI installation or not set ENV variables. + - Solution: Install *oneAPI base toolkit* and enable its ENV through: `source /opt/intel/oneapi/setvars.sh`. - Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`. +- General compiler error: -- In Windows, no result, not error. + - Remove **build** folder or try a clean-build. - Miss to enable oneAPI running environment. +- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux. -- Meet compile error. + Please double-check with `sudo sycl-ls`. - Remove folder **build** and try again. - -- I can **not** see **[ext_oneapi_level_zero:gpu:0]** afer install GPU driver in Linux. - - Please run **sudo sycl-ls**. - - If you see it in result, please add video/render group to your ID: + If it's present in the list, please add video/render group to your user then **logout/login** or restart your system: ``` - sudo usermod -aG render username - sudo usermod -aG video username + sudo usermod -aG render $USER + sudo usermod -aG video $USER ``` + Otherwise, please double-check the GPU driver installation steps. - Then **relogin**. +### **GitHub contribution**: +Please add the **[SYCL]** prefix/tag in issues/PRs titles to help the SYCL-team check/address them without delay. - If you do not see it, please check the installation GPU steps again. - -## Todo +## TODO - Support row layer split for multiple card runs. diff --git a/README.md b/README.md index c2f3342f0..9f2f8df64 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png) -[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) +[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml) [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml) @@ -10,6 +10,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Apr 21] `llama_token_to_piece` can now optionally render special tokens https://github.com/ggerganov/llama.cpp/pull/6807 +- [2024 Apr 4] State and session file functions reorganized under `llama_state_*` https://github.com/ggerganov/llama.cpp/pull/6341 +- [2024 Mar 26] Logits and embeddings API updated for compactness https://github.com/ggerganov/llama.cpp/pull/6122 - [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 @@ -17,7 +20,12 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics -- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017 +- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021** +- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920 +- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387 +- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404 +- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225 +- Multi-GPU pipeline parallelism support https://github.com/ggerganov/llama.cpp/pull/6017 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328 @@ -87,9 +95,11 @@ Typically finetunes of the base models below are supported as well. - [X] LLaMA 🦙 - [x] LLaMA 2 🦙🦙 +- [x] LLaMA 3 🦙🦙🦙 - [X] [Mistral 7B](https://huggingface.co/mistralai/Mistral-7B-v0.1) - [x] [Mixtral MoE](https://huggingface.co/models?search=mistral-ai/Mixtral) -- [X] Falcon +- [x] [DBRX](https://huggingface.co/databricks/dbrx-instruct) +- [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon) - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2) - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne) - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/) @@ -112,7 +122,14 @@ Typically finetunes of the base models below are supported as well. - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) - [x] [Mamba](https://github.com/state-spaces/mamba) -- [x] [Command-R](https://huggingface.co/CohereForAI/c4ai-command-r-v01) +- [x] [Grok-1](https://huggingface.co/keyfan/grok-1-hf) +- [x] [Xverse](https://huggingface.co/models?search=xverse) +- [x] [Command-R models](https://huggingface.co/models?search=CohereForAI/c4ai-command-r) +- [x] [SEA-LION](https://huggingface.co/models?search=sea-lion) +- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B) +- [x] [OLMo](https://allenai.org/olmo) + +(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md)) **Multimodal models:** @@ -122,6 +139,8 @@ Typically finetunes of the base models below are supported as well. - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V) - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM) - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL) +- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM) +- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2) **HTTP server** @@ -136,6 +155,7 @@ Typically finetunes of the base models below are supported as well. - JavaScript/Wasm (works in browser): [tangledgroup/llama-cpp-wasm](https://github.com/tangledgroup/llama-cpp-wasm) - Typescript/Wasm (nicer API, available on npm): [ngxson/wllama](https://github.com/ngxson/wllama) - Ruby: [yoshoku/llama_cpp.rb](https://github.com/yoshoku/llama_cpp.rb) +- Rust (more features): [edgenai/llama_cpp-rs](https://github.com/edgenai/llama_cpp-rs) - Rust (nicer API): [mdrokz/rust-llama.cpp](https://github.com/mdrokz/rust-llama.cpp) - Rust (more direct bindings): [utilityai/llama-cpp-rs](https://github.com/utilityai/llama-cpp-rs) - C#/.NET: [SciSharp/LLamaSharp](https://github.com/SciSharp/LLamaSharp) @@ -145,6 +165,7 @@ Typically finetunes of the base models below are supported as well. - Java: [kherud/java-llama.cpp](https://github.com/kherud/java-llama.cpp) - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig) - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart) +- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326) **UI:** @@ -155,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [nat/openplayground](https://github.com/nat/openplayground) - [Faraday](https://faraday.dev/) (proprietary) - [LMStudio](https://lmstudio.ai/) (proprietary) +- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary) - [LocalAI](https://github.com/mudler/LocalAI) (MIT) - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL) - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile) @@ -165,11 +187,20 @@ Unless otherwise noted these projects are open-source with permissive licensing: - [cztomsik/ava](https://github.com/cztomsik/ava) (MIT) - [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) - [pythops/tenere](https://github.com/pythops/tenere) (AGPL) +- [RecurseChat](https://recurse.chat/) (proprietary) - [semperai/amica](https://github.com/semperai/amica) - [withcatai/catai](https://github.com/withcatai/catai) - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT) - [Msty](https://msty.app) (proprietary) - [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT) +- [KanTV](https://github.com/zhouwg/kantv?tab=readme-ov-file)(Apachev2.0 or later) +- [Dot](https://github.com/alexpinel/Dot) (GPL) +- [MindMac](https://mindmac.app) (proprietary) +- [KodiBot](https://github.com/firatkiral/kodibot) (GPL) +- [eva](https://github.com/ylsdamxssjxxdd/eva) (MIT) +- [AI Sublime Text plugin](https://github.com/yaroslavyaroslav/OpenAI-sublime-text) (MIT) + +*(to have a project listed here, it should clearly state that it depends on `llama.cpp`)* --- @@ -279,6 +310,8 @@ In order to build llama.cpp you have three different options. make ``` + **Note**: for `Debug` builds, run `make LLAMA_DEBUG=1` + - On Windows: 1. Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). @@ -293,12 +326,26 @@ In order to build llama.cpp you have three different options. - Using `CMake`: ```bash - mkdir build - cd build - cmake .. - cmake --build . --config Release + cmake -B build + cmake --build build --config Release ``` + **Note**: for `Debug` builds, there are two cases: + + - Single-config generators (e.g. default = `Unix Makefiles`; note that they just ignore the `--config` flag): + + ```bash + cmake -B build -DCMAKE_BUILD_TYPE=Debug + cmake --build build + ``` + + - Multi-config generators (`-G` param set to Visual Studio, XCode...): + + ```bash + cmake -B build -G "Xcode" + cmake --build build --config Debug + ``` + - Using `Zig` (version 0.11 or later): Building for optimization levels and CPU features can be accomplished using standard build arguments, for example AVX2, FMA, F16C, @@ -410,10 +457,8 @@ Building the program with BLAS support may lead to some performance improvements - Using `CMake` on Linux: ```bash - mkdir build - cd build - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS - cmake --build . --config Release + cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS + cmake --build build --config Release ``` - #### BLIS @@ -433,11 +478,9 @@ Building the program with BLAS support may lead to some performance improvements - Using manual oneAPI installation: By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you already sourced intel environment script and assign `-DLLAMA_BLAS=ON` in cmake, the mkl version of Blas will automatically been selected. Otherwise please install oneAPI and follow the below steps: ```bash - mkdir build - cd build source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation - cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON - cmake --build . --config Release + cmake -B build -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON + cmake --build build --config Release ``` - Using oneAPI docker image: @@ -445,44 +488,39 @@ Building the program with BLAS support may lead to some performance improvements Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information. -- #### cuBLAS +- #### CUDA - This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). + This provides GPU acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). For Jetson user, if you have Jetson Orin, you can try this: [Offical Support](https://www.jetson-ai-lab.com/tutorial_text-generation.html). If you are using an old model(nano/TX2), need some additional operations before compiling. - Using `make`: ```bash - make LLAMA_CUBLAS=1 + make LLAMA_CUDA=1 ``` - Using `CMake`: ```bash - mkdir build - cd build - cmake .. -DLLAMA_CUBLAS=ON - cmake --build . --config Release + cmake -B build -DLLAMA_CUDA=ON + cmake --build build --config Release ``` The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used. The following compilation options are also available to tweak performance: - - | Option | Legal values | Default | Description | - |--------------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | - | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | + | Option | Legal values | Default | Description | + |--------------------------------|------------------------|---------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. | + | LLAMA_CUDA_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels and for the q4_1 and q5_1 matrix matrix multiplication kernels. Can improve performance on relatively recent GPUs. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per CUDA thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | LLAMA_CUDA_PEER_MAX_BATCH_SIZE | Positive integer | 128 | Maximum batch size for which to enable peer access between multiple GPUs. Peer access requires either Linux or NVLink. When using NVLink enabling peer access for larger batch sizes is potentially beneficial. | - #### hipBLAS This provides BLAS acceleration on HIP-supported AMD GPUs. Make sure to have ROCm installed. - You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html). + You can download it from your Linux distro's package manager or from here: [ROCm Quick Start (Linux)](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/tutorial/quick-start.html#rocm-install-quick). - Using `make`: ```bash @@ -491,15 +529,15 @@ Building the program with BLAS support may lead to some performance improvements - Using `CMake` for Linux (assuming a gfx1030-compatible AMD GPU): ```bash CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ \ - cmake -H. -Bbuild -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ - && cmake --build build -- -j 16 + cmake -B build -DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \ + && cmake --build build --config Release -- -j 16 ``` On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DLLAMA_HIP_UMA=ON"`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs). - Using `make` (example for target gfx1030, build with 16 CPU threads): ```bash - make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gxf1030 + make -j16 LLAMA_HIPBLAS=1 LLAMA_HIP_UMA=1 AMDGPU_TARGETS=gfx1030 ``` - Using `CMake` for Windows (using x64 Native Tools Command Prompt for VS, and assuming a gfx1100-compatible AMD GPU): @@ -507,7 +545,7 @@ Building the program with BLAS support may lead to some performance improvements set PATH=%HIP_PATH%\bin;%PATH% mkdir build cd build - cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ .. + cmake -G Ninja -DAMDGPU_TARGETS=gfx1100 -DLLAMA_HIPBLAS=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_BUILD_TYPE=Release .. cmake --build . ``` Make sure that `AMDGPU_TARGETS` is set to the GPU arch you want to compile for. The above example uses `gfx1100` that corresponds to Radeon RX 7900XTX/XT/GRE. You can find a list of targets [here](https://llvm.org/docs/AMDGPUUsage.html#processors) @@ -518,18 +556,18 @@ Building the program with BLAS support may lead to some performance improvements If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3. The following compilation options are also available to tweak performance (yes, they refer to CUDA, not HIP, because it uses the same code as the cuBLAS version above): - | Option | Legal values | Default | Description | - |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | - | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | - | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | + | Option | Legal values | Default | Description | + |-------------------------|------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| + | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the HIP dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | + | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the HIP mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | + | LLAMA_CUDA_KQUANTS_ITER | 1 or 2 | 2 | Number of values processed per iteration and per HIP thread for Q2_K and Q6_K quantization formats. Setting this value to 1 can improve performance for slow GPUs. | - #### CLBlast OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU. You will need the [OpenCL SDK](https://github.com/KhronosGroup/OpenCL-SDK). - - For Ubuntu or Debian, the packages `opencl-headers`, `ocl-icd` may be needed. + - For Ubuntu, Debian, and Fedora the packages `opencl-headers`, `ocl-icd` may be needed. - For Windows, a pre-built SDK is available on the [OpenCL Releases](https://github.com/KhronosGroup/OpenCL-SDK/releases) page. @@ -538,15 +576,14 @@ Building the program with BLAS support may lead to some performance improvements ```sh git clone --recurse-submodules https://github.com/KhronosGroup/OpenCL-SDK.git - mkdir OpenCL-SDK/build - cd OpenCL-SDK/build - cmake .. -DBUILD_DOCS=OFF \ + cd OpenCL-SDK + cmake -B build -DBUILD_DOCS=OFF \ -DBUILD_EXAMPLES=OFF \ -DBUILD_TESTING=OFF \ -DOPENCL_SDK_BUILD_SAMPLES=OFF \ -DOPENCL_SDK_TEST_SAMPLES=OFF - cmake --build . --config Release - cmake --install . --prefix /some/path + cmake --build build + cmake --install build --prefix /some/path ``` @@ -554,6 +591,12 @@ Building the program with BLAS support may lead to some performance improvements Pre-built CLBlast binaries may be found on the [CLBlast Releases](https://github.com/CNugteren/CLBlast/releases) page. For Unix variants, it may also be found in your operating system's packages. + Linux packaging: + Fedora Linux: + ```bash + sudo dnf install clblast + ``` + Alternatively, they may be built from source. -
@@ -562,23 +605,23 @@ Building the program with BLAS support may lead to some performance improvements ```cmd set OPENCL_SDK_ROOT="C:/OpenCL-SDK-v2023.04.17-Win-x64" git clone https://github.com/CNugteren/CLBlast.git - mkdir CLBlast\build - cd CLBlast\build - cmake .. -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64 - cmake --build . --config Release - cmake --install . --prefix C:/CLBlast + cd CLBlast + cmake -B build -DBUILD_SHARED_LIBS=OFF -DOVERRIDE_MSVC_FLAGS_TO_MT=OFF -DTUNERS=OFF -DOPENCL_ROOT=%OPENCL_SDK_ROOT% -G "Visual Studio 17 2022" -A x64 + cmake --build build --config Release + cmake --install build --prefix C:/CLBlast ``` + (note: `--config Release` at build time is the default and only relevant for Visual Studio builds - or multi-config Ninja builds) + -
Unix: ```sh git clone https://github.com/CNugteren/CLBlast.git - mkdir CLBlast/build - cd CLBlast/build - cmake .. -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF - cmake --build . --config Release - cmake --install . --prefix /some/path + cd CLBlast + cmake -B build -DBUILD_SHARED_LIBS=OFF -DTUNERS=OFF + cmake --build build --config Release + cmake --install build --prefix /some/path ``` Where `/some/path` is where the built library will be installed (default is `/usr/local`). @@ -592,21 +635,17 @@ Building the program with BLAS support may lead to some performance improvements ``` - CMake (Unix): ```sh - mkdir build - cd build - cmake .. -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path - cmake --build . --config Release + cmake -B build -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path + cmake --build build --config Release ``` - CMake (Windows): ```cmd set CL_BLAST_CMAKE_PKG="C:/CLBlast/lib/cmake/CLBlast" git clone https://github.com/ggerganov/llama.cpp cd llama.cpp - mkdir build - cd build - cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64 - cmake --build . --config Release - cmake --install . --prefix C:/LlamaCPP + cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=%CL_BLAST_CMAKE_PKG% -G "Visual Studio 17 2022" -A x64 + cmake --build build --config Release + cmake --install build --prefix C:/LlamaCPP ``` ##### Running Llama with CLBlast @@ -662,10 +701,8 @@ Building the program with BLAS support may lead to some performance improvements Then, build llama.cpp using the cmake command below: ```bash - mkdir -p build - cd build - cmake .. -DLLAMA_VULKAN=1 - cmake --build . --config Release + cmake -B build -DLLAMA_VULKAN=1 + cmake --build build --config Release # Test the output binary (with "-ngl 33" to offload all layers to GPU) ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4 @@ -677,6 +714,8 @@ Building the program with BLAS support may lead to some performance improvements To obtain the official LLaMA 2 weights please see the Obtaining and using the Facebook LLaMA 2 model section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face. +Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face. + ```bash # obtain the official LLaMA model weights and place them in ./models ls ./models @@ -730,11 +769,11 @@ From the unzipped folder, open a terminal/cmd window here and place a pre-conver As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. | Model | Original size | Quantized size (Q4_0) | -|------:|--------------:|-----------------------:| -| 7B | 13 GB | 3.9 GB | -| 13B | 24 GB | 7.8 GB | -| 30B | 60 GB | 19.5 GB | -| 65B | 120 GB | 38.5 GB | +|------:|--------------:|----------------------:| +| 7B | 13 GB | 3.9 GB | +| 13B | 24 GB | 7.8 GB | +| 30B | 60 GB | 19.5 GB | +| 65B | 120 GB | 38.5 GB | ### Quantization @@ -742,7 +781,7 @@ Several quantization methods are supported. They differ in the resulting model d *(outdated)* -| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | +| Model | Measure | F16 | Q4_0 | Q4_1 | Q5_0 | Q5_1 | Q8_0 | |------:|--------------|-------:|-------:|-------:|-------:|-------:|-------:| | 7B | perplexity | 5.9066 | 6.1565 | 6.0912 | 5.9862 | 5.9481 | 5.9070 | | 7B | file size | 13.0G | 3.5G | 3.9G | 4.3G | 4.7G | 6.7G | @@ -898,17 +937,25 @@ If your issue is with model generation quality, then please at least scan the fo ### Android +#### Build on Android using Termux +[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required). +``` +apt update && apt upgrade -y +apt install git make cmake +``` + +It's recommended to move your model inside the `~/` directory for best performance: +``` +cd storage/downloads +mv model.gguf ~/ +``` + +[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`. + #### Building the Project using Android NDK -You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/). - -First, install the essential packages for termux: -``` -pkg install clang wget git cmake -``` -Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake: - -You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux. +Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake. +Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux: ``` $ mkdir build-android $ cd build-android @@ -916,7 +963,9 @@ $ export NDK= $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod .. $ make ``` -Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card. + +Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice). + Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission: (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`) @@ -938,53 +987,10 @@ $cd /data/data/com.termux/files/home/bin $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml ``` -Here is a demo of an interactive session running on Pixel 5 phone: +Here's a demo of an interactive session running on Pixel 5 phone: https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4 -#### Building the Project using Termux (F-Droid) -Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card. - -Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU. - -If you opt to utilize OpenBLAS, you'll need to install the corresponding package. -``` -apt install libopenblas -``` - -Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages: -``` -apt install ocl-icd opencl-headers opencl-clhpp clinfo -``` - -In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below: -``` -cmake . -make -cp libclblast.so* $PREFIX/lib -cp ./include/clblast.h ../llama.cpp -``` - -Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below: -``` -cp /data/data/com.termux/files/usr/include/openblas/cblas.h . -cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h . -make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice) -``` - -Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below: -``` -GGML_OPENCL_PLATFORM=0 -GGML_OPENCL_DEVICE=0 -export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH -``` - -(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ ) - -For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle. - -Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script. - ### Docker #### Prerequisites @@ -1090,7 +1096,9 @@ docker run --gpus all -v /path/to/models:/models local/llama.cpp:server-cuda -m - Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a` - See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices -- Matrix multiplication is unconventional: [`z = ggml_mul_mat(ctx, x, y)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means `zT = x @ yT` +- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$ + +![matmul](media/matmul.png) ### Docs diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 000000000..f4322c6ee --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,67 @@ +# Security Policy + + - [**Using llama.cpp securely**](#using-llamacpp-securely) + - [Untrusted models](#untrusted-models) + - [Untrusted inputs](#untrusted-inputs) + - [Data privacy](#data-privacy) + - [Untrusted environments or networks](#untrusted-environments-or-networks) + - [Multi-Tenant environments](#multi-tenant-environments) + - [**Reporting a vulnerability**](#reporting-a-vulnerability) + +## Using llama.cpp securely + +### Untrusted models +Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources. + +*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. + +> [!NOTE] +> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. + +### Untrusted inputs + +Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks. + +For maximum security when handling untrusted inputs, you may need to employ the following: + +* Sandboxing: Isolate the environment where the inference happens. +* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics. +* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches. +* Input Sanitation: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: + * Validation: Enforce strict rules on allowed characters and data types. + * Filtering: Remove potentially malicious scripts or code fragments. + * Encoding: Convert special characters into safe representations. + * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)). + +### Data privacy + +To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors. + +### Untrusted environments or networks + +If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions: +* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value +* Encrypt your data if sending it over the network. + +### Multi-Tenant environments + +If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks. + +1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and malicious users from sending graphs to execute under another tenant's identity. + +2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring. + +3. Model Sharing: In a multitenant model sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk. + +4. Hardware Attacks: GPUs or TPUs can also be attacked. [Researches](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side channel attacks on GPUs are possible, which can make data leak from other models or processes running on the same system at the same time. + +## Reporting a vulnerability + +Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++. + + +However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released. + +Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new). + +A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure. diff --git a/build.zig b/build.zig index a1de7083a..96783574f 100644 --- a/build.zig +++ b/build.zig @@ -112,10 +112,12 @@ pub fn build(b: *std.build.Builder) !void { make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false; const ggml = make.obj("ggml", "ggml.c"); + const sgemm = make.obj("sgemm", "sgemm.cpp"); const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c"); const ggml_backend = make.obj("ggml-backend", "ggml-backend.c"); const ggml_quants = make.obj("ggml-quants", "ggml-quants.c"); const unicode = make.obj("unicode", "unicode.cpp"); + const unicode_data = make.obj("unicode-data", "unicode-data.cpp"); const llama = make.obj("llama", "llama.cpp"); const buildinfo = make.obj("common", "common/build-info.cpp"); const common = make.obj("common", "common/common.cpp"); @@ -127,15 +129,44 @@ pub fn build(b: *std.build.Builder) !void { const clip = make.obj("clip", "examples/llava/clip.cpp"); const llava = make.obj("llava", "examples/llava/llava.cpp"); - _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser }); - _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo }); - _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo }); - _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo }); - _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train }); - _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train }); + _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, console, grammar_parser }); + _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo }); + _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); + _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, train }); - const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, json_schema_to_grammar, clip, llava }); + const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, sgemm, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, unicode_data, common, json_schema_to_grammar, buildinfo, sampling, grammar_parser, clip, llava }); if (server.target.isWindows()) { server.linkSystemLibrary("ws2_32"); } + + const server_assets = [_][]const u8{ "index.html", "index.js", "completion.js", "json-schema-to-grammar.mjs" }; + for (server_assets) |asset| { + const input_path = b.fmt("examples/server/public/{s}", .{asset}); + const output_path = b.fmt("examples/server/{s}.hpp", .{asset}); + + // Portable equivalent of `b.addSystemCommand(&.{ "xxd", "-n", asset, "-i", input_path, output_path }) })`: + + const input = try std.fs.cwd().readFileAlloc(b.allocator, input_path, std.math.maxInt(usize)); + defer b.allocator.free(input); + + var buf = std.ArrayList(u8).init(b.allocator); + defer buf.deinit(); + + for (input) |byte| { + try std.fmt.format(buf.writer(), "0x{X:0>2}, ", .{byte}); + } + + var name = try std.mem.replaceOwned(u8, b.allocator, asset, "-", "_"); + defer b.allocator.free(name); + std.mem.replaceScalar(u8, name, '.', '_'); + + try std.fs.cwd().writeFile(output_path, b.fmt( + "unsigned char {s}[] = {{{s}}};\nunsigned int {s}_len = {d};\n", + .{ name, buf.items, name, input.len }, + )); + + std.debug.print("Dumped hex of \"{s}\" ({s}) to {s}\n", .{ input_path, name, output_path }); + } } diff --git a/ci/run.sh b/ci/run.sh index 51f4c74cc..e67c1a5ff 100755 --- a/ci/run.sh +++ b/ci/run.sh @@ -40,7 +40,7 @@ if [ ! -z ${GG_BUILD_METAL} ]; then fi if [ ! -z ${GG_BUILD_CUDA} ]; then - CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1" + CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUDA=1" fi if [ ! -z ${GG_BUILD_SYCL} ]; then @@ -153,6 +153,54 @@ function gg_sum_ctest_release { gg_printf '```\n' } +# test_scripts_debug + +function gg_run_test_scripts_debug { + cd ${SRC} + + set -e + + (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + + set +e +} + +function gg_sum_test_scripts_debug { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs test scripts in debug mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" + gg_printf '```\n' + gg_printf '\n' +} + +# test_scripts_release + +function gg_run_test_scripts_release { + cd ${SRC} + + set -e + + (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log + + set +e +} + +function gg_sum_test_scripts_release { + gg_printf '### %s\n\n' "${ci}" + + gg_printf 'Runs test scripts in release mode\n' + gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)" + gg_printf '```\n' + gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)" + gg_printf '```\n' + gg_printf '\n' +} + function gg_get_model { local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf" local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf" @@ -287,7 +335,8 @@ function gg_run_open_llama_3b_v2 { (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -412,8 +461,8 @@ function gg_run_open_llama_7b_v2 { set -e - (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log - (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log + (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUDA=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log + (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log python3 ../convert.py ${path_models} @@ -468,7 +517,10 @@ function gg_run_open_llama_7b_v2 { (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log - (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log + (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log function check_ppl { qnt="$1" @@ -575,7 +627,7 @@ function gg_run_embd_bge_small { cd ${SRC} gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json - gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model + gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin @@ -642,6 +694,11 @@ test $ret -eq 0 && gg_run ctest_release if [ -z ${GG_BUILD_LOW_PERF} ]; then test $ret -eq 0 && gg_run embd_bge_small + if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then + test $ret -eq 0 && gg_run test_scripts_debug + test $ret -eq 0 && gg_run test_scripts_release + fi + if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then if [ -z ${GG_BUILD_CUDA} ]; then test $ret -eq 0 && gg_run open_llama_3b_v2 diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 10951693a..0ec8d6d8d 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,9 +47,6 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -set(TARGET json-schema-to-grammar) -add_library(${TARGET} OBJECT json-schema-to-grammar.cpp json-schema-to-grammar.h) - set(TARGET common) add_library(${TARGET} STATIC @@ -63,8 +60,11 @@ add_library(${TARGET} STATIC grammar-parser.h grammar-parser.cpp json.hpp + json-schema-to-grammar.cpp train.h train.cpp + ngram-cache.h + ngram-cache.cpp ) if (BUILD_SHARED_LIBS) diff --git a/common/common.cpp b/common/common.cpp index 192182d05..ba1ecf0e5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1,4 +1,8 @@ #include "common.h" +// Change JSON_ASSERT from assert() to GGML_ASSERT: +#define JSON_ASSERT GGML_ASSERT +#include "json.hpp" +#include "json-schema-to-grammar.h" #include "llama.h" #include @@ -16,6 +20,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include @@ -27,7 +32,6 @@ #ifndef NOMINMAX # define NOMINMAX #endif -#include #include #include #include @@ -39,18 +43,21 @@ #endif #if defined(LLAMA_USE_CURL) #include +#include +#include +#include #endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) -#define GGML_USE_CUBLAS_SYCL +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) +#define GGML_USE_CUDA_SYCL #endif -#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) -#define GGML_USE_CUBLAS_SYCL_VULKAN +#if (defined(GGML_USE_CUDA) || defined(GGML_USE_SYCL)) || defined(GGML_USE_VULKAN) +#define GGML_USE_CUDA_SYCL_VULKAN #endif #if defined(LLAMA_USE_CURL) @@ -61,16 +68,17 @@ #else #include #endif -#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX -#define LLAMA_CURL_MAX_HEADER_LENGTH 256 +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 #endif // LLAMA_USE_CURL +using json = nlohmann::ordered_json; + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores std::unordered_set siblings; for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) { - std::ifstream thread_siblings("/sys/devices/system/cpu" + std::ifstream thread_siblings("/sys/devices/system/cpu/cpu" + std::to_string(cpu) + "/topology/thread_siblings"); if (!thread_siblings.is_open()) { break; // no more cpus @@ -101,7 +109,80 @@ int32_t get_num_physical_cores() { return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4; } -void process_escapes(std::string& input) { +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) +#include + +static void cpuid(unsigned leaf, unsigned subleaf, + unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx) { + __asm__("movq\t%%rbx,%%rsi\n\t" + "cpuid\n\t" + "xchgq\t%%rbx,%%rsi" + : "=a"(*eax), "=S"(*ebx), "=c"(*ecx), "=d"(*edx) + : "0"(leaf), "2"(subleaf)); +} + +static int pin_cpu(int cpu) { + cpu_set_t mask; + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + return pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); +} + +static bool is_hybrid_cpu(void) { + unsigned eax, ebx, ecx, edx; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + return !!(edx & (1u << 15)); +} + +static bool is_running_on_efficiency_core(void) { + unsigned eax, ebx, ecx, edx; + cpuid(0x1a, 0, &eax, &ebx, &ecx, &edx); + int intel_atom = 0x20; + int core_type = (eax & 0xff000000u) >> 24; + return core_type == intel_atom; +} + +static int count_math_cpus(int cpu_count) { + int result = 0; + for (int cpu = 0; cpu < cpu_count; ++cpu) { + if (pin_cpu(cpu)) { + return -1; + } + if (is_running_on_efficiency_core()) { + continue; // efficiency cores harm lockstep threading + } + ++cpu; // hyperthreading isn't useful for linear algebra + ++result; + } + return result; +} + +#endif // __x86_64__ && __linux__ + +/** + * Returns number of CPUs on system that are useful for math. + */ +int get_math_cpu_count() { +#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__) + int cpu_count = sysconf(_SC_NPROCESSORS_ONLN); + if (cpu_count < 1) { + return get_num_physical_cores(); + } + if (is_hybrid_cpu()) { + cpu_set_t affinity; + if (!pthread_getaffinity_np(pthread_self(), sizeof(affinity), &affinity)) { + int result = count_math_cpus(cpu_count); + pthread_setaffinity_np(pthread_self(), sizeof(affinity), &affinity); + if (result > 0) { + return result; + } + } + } +#endif + return get_num_physical_cores(); +} + +void process_escapes(std::string & input) { std::size_t input_len = input.length(); std::size_t output_idx = 0; @@ -154,16 +235,63 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } -static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) { - std::string arg = argv[i]; - llama_sampling_params& sparams = params.sparams; +bool parse_kv_override(const char * data, std::vector & overrides) { + const char * sep = strchr(data, '='); + if (sep == nullptr || sep - data >= 128) { + fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data); + return false; + } + llama_model_kv_override kvo; + std::strncpy(kvo.key, data, sep - data); + kvo.key[sep - data] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = std::atol(sep); + } else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.val_f64 = std::atof(sep); + } else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.val_bool = true; + } else if (std::strcmp(sep, "false") == 0) { + kvo.val_bool = false; + } else { + fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data); + return false; + } + } else if (strncmp(sep, "str:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + if (strlen(sep) > 127) { + fprintf(stderr, "%s: malformed KV override '%s', value cannot exceed 127 chars\n", __func__, data); + return false; + } + strncpy(kvo.val_str, sep, 127); + kvo.val_str[127] = '\0'; + } else { + fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data); + return false; + } + overrides.emplace_back(std::move(kvo)); + return true; +} + +bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) { + llama_sampling_params & sparams = params.sparams; if (arg == "-s" || arg == "--seed") { if (++i >= argc) { invalid_param = true; return true; } + // This is temporary, in the future the samplign state will be moved fully to llama_sampling_context. params.seed = std::stoul(argv[i]); + sparams.seed = std::stoul(argv[i]); return true; } if (arg == "-t" || arg == "--threads") { @@ -648,14 +776,6 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.model = argv[i]; return true; } - if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - return true; - } - params.model_url = argv[i]; - return true; - } if (arg == "-md" || arg == "--model-draft") { if (++i >= argc) { invalid_param = true; @@ -672,6 +792,30 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.model_alias = argv[i]; return true; } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_url = argv[i]; + return true; + } + if (arg == "-hfr" || arg == "--hf-repo") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hf_repo = argv[i]; + return true; + } + if (arg == "-hff" || arg == "--hf-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hf_file = argv[i]; + return true; + } if (arg == "--lora") { if (++i >= argc) { invalid_param = true; @@ -750,13 +894,17 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int invalid_param = true; return true; } - params.image = argv[i]; + params.image.emplace_back(argv[i]); return true; } if (arg == "-i" || arg == "--interactive") { params.interactive = true; return true; } + if (arg == "--interactive-specials") { + params.interactive_specials = true; + return true; + } if (arg == "--embedding") { params.embedding = true; return true; @@ -769,6 +917,10 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.instruct = true; return true; } + if (arg == "-cnv" || arg == "--conversation") { + params.conversation = true; + return true; + } if (arg == "-cml" || arg == "--chatml") { params.chatml = true; return true; @@ -805,6 +957,10 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.cont_batching = true; return true; } + if (arg == "-fa" || arg == "--flash-attn") { + params.flash_attn = true; + return true; + } if (arg == "--color") { params.use_color = true; return true; @@ -843,9 +999,9 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return true; } params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL return true; } if (arg == "--split-mode" || arg == "-sm") { @@ -871,9 +1027,9 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int invalid_param = true; return true; } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL return true; } if (arg == "--tensor-split" || arg == "-ts") { @@ -899,9 +1055,9 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.tensor_split[i] = 0.0f; } } -#ifndef GGML_USE_CUBLAS_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL +#ifndef GGML_USE_CUDA_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without CUDA/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUDA_SYCL_VULKAN return true; } if (arg == "--no-mmap") { @@ -948,6 +1104,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int } return true; } + if (arg == "-lcs" || arg == "--lookup-cache-static") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_static = argv[i]; + return true; + } + if (arg == "-lcd" || arg == "--lookup-cache-dynamic") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lookup_cache_dynamic = argv[i]; + return true; + } if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { if (++i >= argc) { invalid_param = true; @@ -976,6 +1148,10 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.n_print = std::stoi(argv[i]); return true; } + if (arg == "--check-tensors") { + params.check_tensors = true; + return true; + } if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -1028,8 +1204,8 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int params.ignore_eos = true; return true; } - if (arg == "--no-penalize-nl") { - sparams.penalize_nl = false; + if (arg == "--penalize-nl") { + sparams.penalize_nl = true; return true; } if (arg == "-l" || arg == "--logit-bias") { @@ -1114,52 +1290,24 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int ); return true; } + if (arg == "-j" || arg == "--json-schema") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = json_schema_to_grammar(json::parse(argv[i])); + return true; + } if (arg == "--override-kv") { if (++i >= argc) { invalid_param = true; return true; } - char* sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } - else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } - else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } - else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } - else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - return true; - } - } - else { + if (!parse_kv_override(argv[i], params.kv_overrides)) { fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); invalid_param = true; return true; } - params.kv_overrides.push_back(kvo); return true; } #ifndef LOG_DISABLE_LOGS @@ -1189,6 +1337,29 @@ static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return false; } +void gpt_params_handle_model_default(gpt_params & params) { + if (!params.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (params.hf_file.empty()) { + if (params.model.empty()) { + throw std::invalid_argument("error: --hf-repo requires either --hf-file or --model\n"); + } + params.hf_file = params.model; + } else if (params.model.empty()) { + params.model = "models/" + string_split(params.hf_file, '/').back(); + } + } else if (!params.model_url.empty()) { + if (params.model.empty()) { + auto f = string_split(params.model_url, '#').front(); + f = string_split(f, '?').front(); + f = string_split(f, '/').back(); + params.model = "models/" + f; + } + } else if (params.model.empty()) { + params.model = DEFAULT_MODEL_PATH; + } +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -1200,14 +1371,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { std::replace(arg.begin(), arg.end(), '_', '-'); } - - if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) { + if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) { throw std::invalid_argument("error: unknown argument: " + arg); } + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } } - if (invalid_param) { - throw std::invalid_argument("error: invalid parameter for argument: " + arg); - } + if (params.prompt_cache_all && (params.interactive || params.interactive_first || params.instruct)) { @@ -1215,6 +1386,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } + gpt_params_handle_model_default(params); + if (params.escape) { process_escapes(params.prompt); process_escapes(params.input_prefix); @@ -1251,7 +1424,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -h, --help show this help message and exit\n"); printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); + printf(" --interactive-specials allow special tokens in user text, in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); + printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n"); printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); @@ -1312,6 +1487,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); printf(" --grammar GRAMMAR BNF-like grammar to constrain generations (see samples in grammars/ dir)\n"); printf(" --grammar-file FNAME file to read grammar from\n"); + printf(" -j SCHEMA, --json-schema SCHEMA\n"); + printf(" JSON schema to constrain generations (https://json-schema.org/), e.g. `{}` for any JSON object.\n"); + printf(" For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead\n"); printf(" --cfg-negative-prompt PROMPT\n"); printf(" negative prompt to use for guidance. (default: empty)\n"); printf(" --cfg-negative-prompt-file FNAME\n"); @@ -1332,7 +1510,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -dt N, --defrag-thold N\n"); printf(" KV cache defragmentation threshold (default: %.1f, < 0 - disabled)\n", params.defrag_thold); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - printf(" --no-penalize-nl do not penalize newline token\n"); + printf(" --penalize-nl penalize newline tokens\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -1349,8 +1527,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -ns N, --sequences N number of sequences to decode (default: %d)\n", params.n_sequences); printf(" -ps N, --p-split N speculative decoding split probability (default: %.1f)\n", (double)params.p_split); printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n"); + printf(" -fa, --flash-attn enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n"); - printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n"); + printf(" --image IMAGE_FILE path to an image file. use with multimodal models. Specify multiple times for batching\n"); if (llama_supports_mlock()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -1403,18 +1582,27 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --control-vector-layer-range START END\n"); printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", params.model.c_str()); - printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: %s)\n", params.model_url.c_str()); + printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH); printf(" -md FNAME, --model-draft FNAME\n"); - printf(" draft model for speculative decoding\n"); + printf(" draft model for speculative decoding (default: unused)\n"); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: unused)\n"); + printf(" -hfr REPO, --hf-repo REPO\n"); + printf(" Hugging Face model repository (default: unused)\n"); + printf(" -hff FILE, --hf-file FILE\n"); + printf(" Hugging Face model file (default: unused)\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); printf(" path under which to save YAML logs (no logging if unset)\n"); + printf(" -lcs FNAME, --lookup-cache-static FNAME\n"); + printf(" path to static lookup cache to use for lookup decoding (not updated by generation)\n"); + printf(" -lcd FNAME, --lookup-cache-dynamic FNAME\n"); + printf(" path to dynamic lookup cache to use for lookup decoding (updated by generation)\n"); printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); - printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); printf(" -ptc N, --print-token-count N\n"); printf(" print token count every N tokens (default: %d)\n", params.n_print); + printf(" --check-tensors check model tensor data for invalid values\n"); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -1451,6 +1639,77 @@ std::string gpt_random_prompt(std::mt19937 & rng) { GGML_UNREACHABLE(); } +// Validate if a filename is safe to use +// To validate a full path, split the path by the OS-specific path separator, and validate each part with this function +bool validate_file_name(const std::string & filename) { + if (!filename.length()) { + // Empty filename invalid + return false; + } + if (filename.length() > 255) { + // Limit at common largest possible filename on Linux filesystems + // to avoid unnecessary further validation + // (On systems with smaller limits it will be caught by the OS) + return false; + } + + std::u32string filename_utf32; + try { + std::wstring_convert, char32_t> converter; + filename_utf32 = converter.from_bytes(filename); + + // If the reverse conversion mismatches, it means overlong UTF-8 sequences were used, + // or invalid encodings were encountered. Reject such attempts + std::string filename_reencoded = converter.to_bytes(filename_utf32); + if (filename_reencoded != filename) { + return false; + } + } catch (const std::exception &) { + return false; + } + + // Check for forbidden codepoints: + // - Control characters + // - Unicode equivalents of illegal characters + // - UTF-16 surrogate pairs + // - UTF-8 replacement character + // - Byte order mark (BOM) + // - Illegal characters: / \ : * ? " < > | + for (char32_t c : filename_utf32) { + if (c <= 0x1F // Control characters (C0) + || c == 0x7F // Control characters (DEL) + || (c >= 0x80 && c <= 0x9F) // Control characters (C1) + || c == 0xFF0E // Fullwidth Full Stop (period equivalent) + || c == 0x2215 // Division Slash (forward slash equivalent) + || c == 0x2216 // Set Minus (backslash equivalent) + || (c >= 0xD800 && c <= 0xDFFF) // UTF-16 surrogate pairs + || c == 0xFFFD // Replacement Character (UTF-8) + || c == 0xFEFF // Byte Order Mark (BOM) + || c == '/' || c == '\\' || c == ':' || c == '*' // Illegal characters + || c == '?' || c == '"' || c == '<' || c == '>' || c == '|') { + return false; + } + } + + // Reject any leading or trailing ' ', or any trailing '.', these are stripped on Windows and will cause a different filename + // Unicode and other whitespace is not affected, only 0x20 space + if (filename.front() == ' ' || filename.back() == ' ' || filename.back() == '.') { + return false; + } + + // Reject any ".." (currently stricter than necessary, it should be fine to just check for == ".." instead) + if (filename.find("..") != std::string::npos) { + return false; + } + + // Reject "." + if (filename == ".") { + return false; + } + + return true; +} + // // String utils // @@ -1468,6 +1727,18 @@ std::vector string_split(std::string input, char separator) { return parts; } +std::string string_strip(const std::string & str) { + size_t start = 0; + size_t end = str.size(); + while (start < end && std::isspace(str[start])) { + start++; + } + while (end > start && std::isspace(str[end - 1])) { + end--; + } + return str.substr(start, end - start); +} + std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names) { std::unordered_map sampler_canonical_name_map { {"top_k", llama_sampler_type::TOP_K}, @@ -1564,6 +1835,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; + mparams.check_tensors = params.check_tensors; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1625,7 +1897,10 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; cparams.defrag_thold = params.defrag_thold; + cparams.cb_eval = params.cb_eval; + cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; + cparams.flash_attn = params.flash_attn; cparams.type_k = kv_cache_type_from_str(params.cache_type_k); cparams.type_v = kv_cache_type_from_str(params.cache_type_v); @@ -1656,71 +1931,75 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, - struct llama_model_params params) { - // Basic validation of the model_url - if (!model_url || strlen(model_url) == 0) { - fprintf(stderr, "%s: invalid model_url\n", __func__); - return NULL; - } +static bool starts_with(const std::string & str, const std::string & prefix) { + // While we wait for C++20's std::string::starts_with... + return str.rfind(prefix, 0) == 0; +} - // Initialize libcurl globally - auto curl = curl_easy_init(); +static bool llama_download_file(const std::string & url, const std::string & path) { + // Initialize libcurl + std::unique_ptr curl(curl_easy_init(), &curl_easy_cleanup); if (!curl) { fprintf(stderr, "%s: error initializing libcurl\n", __func__); - return NULL; + return false; } + bool force_download = false; + // Set the URL, allow to follow http redirection - curl_easy_setopt(curl, CURLOPT_URL, model_url); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + #if defined(_WIN32) // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); #endif // Check if the file already exists locally struct stat model_file_info; - auto file_exists = (stat(path_model, &model_file_info) == 0); + auto file_exists = (stat(path.c_str(), &model_file_info) == 0); - // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files - char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); - - char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); + // If the file exists, check its JSON metadata companion file. + std::string metadata_path = path + ".json"; + nlohmann::json metadata; + std::string etag; + std::string last_modified; if (file_exists) { - auto * f_etag = fopen(etag_path, "r"); - if (f_etag) { - if (!fgets(etag, sizeof(etag), f_etag)) { - fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); - } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). + std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("url") && metadata.at("url").is_string()) { + auto previous_url = metadata.at("url").get(); + if (previous_url != url) { + fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + return false; + } + } + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); + } + if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); + } + } catch (const nlohmann::json::exception & e) { + fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + return false; } - fclose(f_etag); - } - - auto * f_last_modified = fopen(last_modified_path, "r"); - if (f_last_modified) { - if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { - fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); - } else { - fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path, - last_modified); - } - fclose(f_last_modified); } + } else { + fprintf(stderr, "%s: no previous model file found %s\n", __func__, path.c_str()); } // Send a HEAD request to retrieve the etag and last-modified headers struct llama_load_model_from_url_headers { - char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; - char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + std::string etag; + std::string last_modified; }; llama_load_model_from_url_headers headers; { @@ -1728,148 +2007,271 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; - const char * etag_prefix = "etag: "; - if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { - strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF - } + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - const char * last_modified_prefix = "last-modified: "; - if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { - strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), - n_items - strlen(last_modified_prefix) - 2); // Remove CRLF + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; + } } return n_items; }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - CURLcode res = curl_easy_perform(curl); + CURLcode res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; - curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code != 200) { // HEAD not supported, we don't know if the file has changed // force trigger downloading - file_exists = false; + force_download = true; fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } } - // If the ETag or the Last-Modified headers are different: trigger a new download - if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { - char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + bool should_download = !file_exists || force_download; + if (!should_download) { + if (!etag.empty() && etag != headers.etag) { + fprintf(stderr, "%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + should_download = true; + } else if (!last_modified.empty() && last_modified != headers.last_modified) { + fprintf(stderr, "%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + should_download = true; + } + } + if (should_download) { + std::string path_temporary = path + ".downloadInProgress"; if (file_exists) { - fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); - if (remove(path_model) != 0) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + if (remove(path.c_str()) != 0) { + fprintf(stderr, "%s: unable to delete file: %s\n", __func__, path.c_str()); + return false; } } // Set the output file - auto * outfile = fopen(path_model_temporary, "wb"); + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb"), fclose); if (!outfile) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); - return NULL; + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path.c_str()); + return false; } typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { return fwrite(data, size, nmemb, (FILE *)fd); }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); // display download progress - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); + + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; // start the download - fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - model_url, path_model, headers.etag, headers.last_modified); - auto res = curl_easy_perform(curl); + fprintf(stderr, "%s: downloading from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + auto res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - fclose(outfile); - curl_easy_cleanup(curl); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + return false; } long http_code = 0; - curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); if (http_code < 200 || http_code >= 400) { - fclose(outfile); - curl_easy_cleanup(curl); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); - return NULL; + return false; } - // Clean up - fclose(outfile); + // Causes file to be closed explicitly here before we rename it. + outfile.reset(); - // Write the new ETag to the .etag file - if (strlen(headers.etag) > 0) { - auto * etag_file = fopen(etag_path, "w"); - if (etag_file) { - fputs(headers.etag, etag_file); - fclose(etag_file); - fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); - } - } + // Write the updated JSON metadata file. + metadata.update({ + {"url", url}, + {"etag", headers.etag}, + {"lastModified", headers.last_modified} + }); + std::ofstream(metadata_path) << metadata.dump(4); + fprintf(stderr, "%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); - // Write the new lastModified to the .etag file - if (strlen(headers.last_modified) > 0) { - auto * last_modified_file = fopen(last_modified_path, "w"); - if (last_modified_file) { - fputs(headers.last_modified, last_modified_file); - fclose(last_modified_file); - fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, - headers.last_modified); - } - } - - if (rename(path_model_temporary, path_model) != 0) { - curl_easy_cleanup(curl); - fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); - return NULL; + if (rename(path_temporary.c_str(), path.c_str()) != 0) { + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + return false; } } - curl_easy_cleanup(curl); + return true; +} + +struct llama_model * llama_load_model_from_url( + const char * model_url, + const char * path_model, + const struct llama_model_params & params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + + if (!llama_download_file(model_url, path_model)) { + return NULL; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(path_model, gguf_params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, path_model); + return NULL; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), path_model, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model file name: %s" + " n_split=%d\n", __func__, path_model, n_split); + return NULL; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url, 0, n_split)) { + fprintf(stderr, "\n%s: unexpected model url: %s" + " n_split=%d\n", __func__, model_url, n_split); + return NULL; + } + } + + // Prepare download in parallel + std::vector> futures_download; + for (int idx = 1; idx < n_split; idx++) { + futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split](int download_idx) -> bool { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); + + return llama_download_file(split_url, split_path); + }, idx)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return NULL; + } + } + } return llama_load_model_from_file(path_model, params); } +struct llama_model * llama_load_model_from_hf( + const char * repo, + const char * model, + const char * path_model, + const struct llama_model_params & params) { + // construct hugging face model url: + // + // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf + // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf + // + // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf + // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf + // + + std::string model_url = "https://huggingface.co/"; + model_url += repo; + model_url += "/resolve/main/"; + model_url += model; + + return llama_load_model_from_url(model_url.c_str(), path_model, params); +} + #else -struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, - struct llama_model_params /*params*/) { +struct llama_model * llama_load_model_from_url( + const char * /*model_url*/, + const char * /*path_model*/, + const struct llama_model_params & /*params*/) { fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); return nullptr; } +struct llama_model * llama_load_model_from_hf( + const char * /*repo*/, + const char * /*model*/, + const char * /*path_model*/, + const struct llama_model_params & /*params*/) { + fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); + return nullptr; +} + #endif // LLAMA_USE_CURL std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); llama_model * model = nullptr; - if (!params.model_url.empty()) { + + if (!params.hf_repo.empty() && !params.hf_file.empty()) { + model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams); + } else if (!params.model_url.empty()) { model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); } else { model = llama_load_model_from_file(params.model.c_str(), mparams); } + if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); @@ -1909,7 +2311,7 @@ std::tuple llama_init_from_gpt_par } for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) { - const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]); + const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]); float lora_scale = std::get<1>(params.lora_adapter[i]); int err = llama_model_apply_lora_from_file(model, lora_adapter.c_str(), @@ -1930,7 +2332,7 @@ std::tuple llama_init_from_gpt_par params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } - { + if (params.warmup) { LOG("warming up the model with an empty run\n"); std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; @@ -1950,23 +2352,23 @@ std::tuple llama_init_from_gpt_par std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos, - bool special) { - return llama_tokenize(llama_get_model(ctx), text, add_bos, special); + bool add_special, + bool parse_special) { + return llama_tokenize(llama_get_model(ctx), text, add_special, parse_special); } std::vector llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos, - bool special) { + bool add_special, + bool parse_special) { // upper limit for the number of tokens - int n_tokens = text.length() + add_bos; + int n_tokens = text.length() + 2 * add_special; std::vector result(n_tokens); - n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); + n_tokens = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_bos, special); + int check = llama_tokenize(model, text.data(), text.length(), result.data(), result.size(), add_special, parse_special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -1974,12 +2376,12 @@ std::vector llama_tokenize( return result; } -std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { +std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); GGML_ASSERT(check == -n_tokens); } else { result.resize(n_tokens); @@ -2191,7 +2593,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "cpu_has_avx512: %s\n", ggml_cpu_has_avx512() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false"); fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false"); - fprintf(stream, "cpu_has_cublas: %s\n", ggml_cpu_has_cublas() ? "true" : "false"); + fprintf(stream, "cpu_has_cuda: %s\n", ggml_cpu_has_cuda() ? "true" : "false"); fprintf(stream, "cpu_has_vulkan: %s\n", ggml_cpu_has_vulkan() ? "true" : "false"); fprintf(stream, "cpu_has_clblast: %s\n", ggml_cpu_has_clblast() ? "true" : "false"); fprintf(stream, "cpu_has_kompute: %s\n", ggml_cpu_has_kompute() ? "true" : "false"); @@ -2253,6 +2655,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str()); fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false"); fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); + fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); @@ -2286,14 +2689,14 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "mirostat_ent: %f # default: 5.0\n", sparams.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", sparams.mirostat_eta); fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false"); - fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str()); + fprintf(stream, "model: %s # default: %s\n", params.model.c_str(), DEFAULT_MODEL_PATH); fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str()); fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false"); fprintf(stream, "n_gpu_layers: %d # default: -1\n", params.n_gpu_layers); fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); + fprintf(stream, "penalize_nl: %s # default: false\n", sparams.penalize_nl ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); fprintf(stream, "presence_penalty: %f # default: 0.0\n", sparams.penalty_present); @@ -2321,6 +2724,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed); fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false"); fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false"); + fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false"); fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp); const std::vector tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices()); diff --git a/common/common.h b/common/common.h index 8dd8a3edc..d80344f2a 100644 --- a/common/common.h +++ b/common/common.h @@ -31,6 +31,8 @@ fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ } while(0) +#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf" + // build info extern int LLAMA_BUILD_NUMBER; extern char const *LLAMA_COMMIT; @@ -39,6 +41,7 @@ extern char const *LLAMA_BUILD_TARGET; struct llama_control_vector_load_info; +int get_math_cpu_count(); int32_t get_num_physical_cores(); // @@ -48,7 +51,7 @@ int32_t get_num_physical_cores(); struct gpt_params { uint32_t seed = LLAMA_DEFAULT_SEED; // RNG seed - int32_t n_threads = get_num_physical_cores(); + int32_t n_threads = get_math_cpu_count(); int32_t n_threads_draft = -1; int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads) int32_t n_threads_batch_draft = -1; @@ -80,26 +83,33 @@ struct gpt_params { int32_t yarn_orig_ctx = 0; // YaRN original context length float defrag_thold = -1.0f; // KV cache defragmentation threshold + ggml_backend_sched_eval_callback cb_eval = nullptr; + void * cb_eval_user_data = nullptr; + ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED; - llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; - llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings + enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED; + enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings // // sampling parameters struct llama_sampling_params sparams; - std::string model = "models/7B/ggml-model-f16.gguf"; // model path - std::string model_url = ""; // model url to download - std::string model_draft = ""; // draft model for speculative decoding - std::string model_alias = "unknown"; // model alias - std::string prompt = ""; - std::string prompt_file = ""; // store the external prompt file name - std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state - std::string input_prefix = ""; // string to prefix user inputs with - std::string input_suffix = ""; // string to suffix user inputs with + std::string model = ""; // model path + std::string model_draft = ""; // draft model for speculative decoding + std::string model_alias = "unknown"; // model alias + std::string model_url = ""; // model url to download + std::string hf_repo = ""; // HF repo + std::string hf_file = ""; // HF file + std::string prompt = ""; + std::string prompt_file = ""; // store the external prompt file name + std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state + std::string input_prefix = ""; // string to prefix user inputs with + std::string input_suffix = ""; // string to suffix user inputs with std::vector antiprompt; // string upon seeing which more user input is prompted - std::string logdir = ""; // directory in which to save YAML log files - std::string logits_file = ""; // file for saving *all* logits + std::string logdir = ""; // directory in which to save YAML log files + std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding + std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding + std::string logits_file = ""; // file for saving *all* logits std::vector kv_overrides; @@ -125,11 +135,13 @@ struct gpt_params { bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed - bool kl_divergence = false; // compute KL-divergence + bool kl_divergence = false; // compute KL divergence bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode + bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode + bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix) bool chatml = false; // chatml mode (used for models trained on chatml syntax) bool prompt_cache_all = false; // save user input and generations to prompt cache bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it @@ -139,7 +151,8 @@ struct gpt_params { bool interactive_first = false; // wait for user input immediately bool multiline_input = false; // reverse the usage of `\` bool simple_io = false; // improves compatibility with subprocesses and limited consoles - bool cont_batching = false; // insert new sequences for decoding on-the-fly + bool cont_batching = true; // insert new sequences for decoding on-the-fly + bool flash_attn = false; // flash attention bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix bool ignore_eos = false; // ignore generated EOS tokens @@ -152,27 +165,37 @@ struct gpt_params { bool infill = false; // use infill mode bool dump_kv_cache = false; // dump the KV cache contents for debugging purposes bool no_kv_offload = false; // disable KV offloading + bool warmup = true; // warmup run + bool check_tensors = false; // validate tensor data std::string cache_type_k = "f16"; // KV cache data type for the K std::string cache_type_v = "f16"; // KV cache data type for the V // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector - std::string image = ""; // path to an image file + std::string mmproj = ""; // path to multimodal projector + std::vector image; // path to image file(s) }; +void gpt_params_handle_model_default(gpt_params & params); + +bool parse_kv_override(const char * data, std::vector & overrides); + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params); bool gpt_params_parse(int argc, char ** argv, gpt_params & params); void gpt_print_usage(int argc, char ** argv, const gpt_params & params); +bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param); + std::string get_system_info(const gpt_params & params); std::string gpt_random_prompt(std::mt19937 & rng); void process_escapes(std::string& input); +bool validate_file_name(const std::string & filename); + // // String utils // @@ -180,6 +203,7 @@ void process_escapes(std::string& input); std::vector sampler_types_from_names(const std::vector & names, bool allow_alt_names); std::vector sampler_types_from_chars(const std::string & names_string); std::vector string_split(std::string input, char separator); +std::string string_strip(const std::string & str); std::string sampler_type_to_name_string(llama_sampler_type sampler_type); // @@ -192,8 +216,8 @@ std::tuple llama_init_from_gpt_par struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params); -struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, - struct llama_model_params params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const struct llama_model_params & params); +struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const struct llama_model_params & params); // Batch utils @@ -215,20 +239,21 @@ void llama_batch_add( std::vector llama_tokenize( const struct llama_context * ctx, const std::string & text, - bool add_bos, - bool special = false); + bool add_special, + bool parse_special = false); std::vector llama_tokenize( const struct llama_model * model, const std::string & text, - bool add_bos, - bool special = false); + bool add_special, + bool parse_special = false); -// tokenizes a token into a piece +// tokenizes a token into a piece, optionally renders special/control tokens // should work similar to Python's `tokenizer.id_to_piece` std::string llama_token_to_piece( const struct llama_context * ctx, - llama_token token); + llama_token token, + bool special = true); // TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function // that takes into account the tokenizer type and decides how to handle the leading space @@ -302,3 +327,10 @@ struct llama_control_vector_load_info { // Load control vectors, scale each by strength, and add them together. // On error, returns {-1, empty} llama_control_vector_data llama_control_vector_load(const std::vector & load_infos); + +// +// Split utils +// +static const char * const LLM_KV_SPLIT_NO = "split.no"; +static const char * const LLM_KV_SPLIT_COUNT = "split.count"; +static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; diff --git a/common/grammar-parser.cpp b/common/grammar-parser.cpp index 2a1301569..fecb7cd71 100644 --- a/common/grammar-parser.cpp +++ b/common/grammar-parser.cpp @@ -142,6 +142,9 @@ namespace grammar_parser { pos++; last_sym_start = out_elements.size(); while (*pos != '"') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); @@ -156,6 +159,9 @@ namespace grammar_parser { } last_sym_start = out_elements.size(); while (*pos != ']') { + if (!*pos) { + throw std::runtime_error("unexpected end of input"); + } auto char_pair = parse_char(pos); pos = char_pair.second; enum llama_gretype type = last_sym_start < out_elements.size() @@ -164,6 +170,9 @@ namespace grammar_parser { out_elements.push_back({type, char_pair.first}); if (pos[0] == '-' && pos[1] != ']') { + if (!pos[1]) { + throw std::runtime_error("unexpected end of input"); + } auto endchar_pair = parse_char(pos + 1); pos = endchar_pair.second; out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index dc6d30ef1..0f8f1b1d4 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -9,37 +9,103 @@ #include #include -using json = nlohmann::json; +using json = nlohmann::ordered_json; + +template +static std::string join(Iterator begin, Iterator end, const std::string & separator); + +static std::string repeat(const std::string & str, size_t n); + +static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "", bool item_rule_is_literal = false) { + if (separator_rule.empty()) { + if (min_items == 0 && max_items == 1) { + return item_rule + "?"; + } else if (min_items == 1 && max_items == std::numeric_limits::max()) { + return item_rule + "+"; + } + } + + std::string result; + if (min_items > 0) { + if (item_rule_is_literal && separator_rule.empty()) { + result = "\"" + repeat(std::string(item_rule.begin() + 1, item_rule.end() - 1), min_items) + "\""; + } else { + std::vector items(min_items, item_rule); + result = join(items.begin(), items.end(), separator_rule.empty() ? " " : " " + separator_rule + " "); + } + } + + std::function opt_repetitions = [&](int up_to_n, bool prefix_with_sep) -> std::string { + auto content = prefix_with_sep && !separator_rule.empty() ? separator_rule + " " + item_rule : item_rule; + + if (up_to_n == 0) { + return ""; + } else if (up_to_n == 1) { + return "(" + content + ")?"; + } else if (!separator_rule.empty() && !prefix_with_sep) { + return "(" + content + " " + opt_repetitions(up_to_n - 1, true) + ")?"; + } else { + std::string res = repeat("(" + content + " ", up_to_n); + // strip trailing space + res = res.substr(0, res.length() - 1); + res += repeat(")?", up_to_n); + return res; + } + }; + + if (min_items > 0 && max_items != min_items) { + result += " "; + } + + if (max_items != std::numeric_limits::max()) { + result += opt_repetitions(max_items - min_items, min_items > 0); + } else { + std::string item_operator = "(" + (separator_rule.empty() ? "" : separator_rule + " ") + item_rule + ")"; + if (min_items == 0 && !separator_rule.empty()) { + result = "(" + item_rule + " " + item_operator + "*)?"; + } else { + result += item_operator + "*"; + } + } + + return result; +} const std::string SPACE_RULE = "\" \"?"; -std::unordered_map PRIMITIVE_RULES = { - {"boolean", "(\"true\" | \"false\") space"}, - {"number", "(\"-\"? ([0-9] | [1-9] [0-9]*)) (\".\" [0-9]+)? ([eE] [-+]? [0-9]+)? space"}, - {"integer", "(\"-\"? ([0-9] | [1-9] [0-9]*)) space"}, - {"value", "object | array | string | number | boolean"}, - {"object", "\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space"}, - {"array", "\"[\" space ( value (\",\" space value)* )? \"]\" space"}, - {"uuid", "\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " - "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space"}, - {"string", " \"\\\"\" (\n" - " [^\"\\\\] |\n" - " \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])\n" - " )* \"\\\"\" space"}, - {"null", "\"null\" space"} +struct BuiltinRule { + std::string content; + std::vector deps; }; -std::vector OBJECT_RULE_NAMES = {"object", "array", "string", "number", "boolean", "null", "value"}; -std::unordered_map DATE_RULES = { - {"date", "[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )"}, - {"time", "([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )"}, - {"date-time", "date \"T\" time"}, - {"date-string", "\"\\\"\" date \"\\\"\" space"}, - {"time-string", "\"\\\"\" time \"\\\"\" space"}, - {"date-time-string", "\"\\\"\" date-time \"\\\"\" space"} +const std::string _up_to_15_digits = build_repetition("[0-9]", 0, 15); + +std::unordered_map PRIMITIVE_RULES = { + {"boolean", {"(\"true\" | \"false\") space", {}}}, + {"decimal-part", {"[0-9] " + _up_to_15_digits, {}}}, + {"integral-part", {"[0-9] | [1-9] " + _up_to_15_digits, {}}}, + {"number", {"(\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space", {"integral-part", "decimal-part"}}}, + {"integer", {"(\"-\"? integral-part) space", {"integral-part"}}}, + {"value", {"object | array | string | number | boolean | null", {"object", "array", "string", "number", "boolean", "null"}}}, + {"object", {"\"{\" space ( string \":\" space value (\",\" space string \":\" space value)* )? \"}\" space", {"string", "value"}}}, + {"array", {"\"[\" space ( value (\",\" space value)* )? \"]\" space", {"value"}}}, + {"uuid", {"\"\\\"\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] " + "\"-\" [0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F] \"\\\"\" space", {}}}, + {"char", {"[^\"\\\\] | \"\\\\\" ([\"\\\\/bfnrt] | \"u\" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])", {}}}, + {"string", {"\"\\\"\" char* \"\\\"\" space", {"char"}}}, + {"null", {"\"null\" space", {}}}, +}; + +std::unordered_map STRING_FORMAT_RULES = { + {"date", {"[0-9] [0-9] [0-9] [0-9] \"-\" ( \"0\" [1-9] | \"1\" [0-2] ) \"-\" ( \"0\" [1-9] | [1-2] [0-9] | \"3\" [0-1] )", {}}}, + {"time", {"([01] [0-9] | \"2\" [0-3]) \":\" [0-5] [0-9] \":\" [0-5] [0-9] ( \".\" [0-9] [0-9] [0-9] )? ( \"Z\" | ( \"+\" | \"-\" ) ( [01] [0-9] | \"2\" [0-3] ) \":\" [0-5] [0-9] )", {}}}, + {"date-time", {"date \"T\" time", {"date", "time"}}}, + {"date-string", {"\"\\\"\" date \"\\\"\" space", {"date"}}}, + {"time-string", {"\"\\\"\" time \"\\\"\" space", {"time"}}}, + {"date-time-string", {"\"\\\"\" date-time \"\\\"\" space", {"date-time"}}} }; static bool is_reserved_name(const std::string & name) { @@ -47,7 +113,7 @@ static bool is_reserved_name(const std::string & name) { if (RESERVED_NAMES.empty()) { RESERVED_NAMES.insert("root"); for (const auto &p : PRIMITIVE_RULES) RESERVED_NAMES.insert(p.first); - for (const auto &p : DATE_RULES) RESERVED_NAMES.insert(p.first); + for (const auto &p : STRING_FORMAT_RULES) RESERVED_NAMES.insert(p.first); } return RESERVED_NAMES.find(name) != RESERVED_NAMES.end(); } @@ -124,7 +190,7 @@ static std::string replacePattern(const std::string & input, const std::regex & } static std::string format_literal(const std::string & literal) { - std::string escaped = replacePattern(json(literal).dump(), GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) { + std::string escaped = replacePattern(literal, GRAMMAR_LITERAL_ESCAPE_RE, [&](const std::smatch & match) { char c = match.str()[0]; return GRAMMAR_LITERAL_ESCAPES.at(c); }); @@ -137,7 +203,7 @@ private: std::function _fetch_json; bool _dotall; std::map _rules; - std::unordered_map _refs; + std::unordered_map _refs; std::unordered_set _refs_being_resolved; std::vector _errors; std::vector _warnings; @@ -192,7 +258,7 @@ private: if (_dotall) { rule = "[\\U00000000-\\U0010FFFF]"; } else { - rule = "[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]"; + rule = "[^\\x0A\\x0D]"; } return _add_rule("dot", rule); }; @@ -308,47 +374,21 @@ private: auto &sub = last.first; auto sub_is_literal = last.second; - if (min_times == 0 && max_times == std::numeric_limits::max()) { - sub += "*"; - } else if (min_times == 0 && max_times == 1) { - sub += "?"; - } else if (min_times == 1 && max_times == std::numeric_limits::max()) { - sub += "+"; - } else { - if (!sub_is_literal) { - std::string & sub_id = sub_rule_ids[sub]; - if (sub_id.empty()) { - sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); - } - sub = sub_id; + if (!sub_is_literal) { + std::string & sub_id = sub_rule_ids[sub]; + if (sub_id.empty()) { + sub_id = _add_rule(name + "-" + std::to_string(sub_rule_ids.size()), sub); } - std::string result; - if (sub_is_literal && min_times > 0) { - result = "\"" + repeat(sub.substr(1, sub.length() - 2), min_times) + "\""; - } else { - for (int j = 0; j < min_times; j++) { - if (j > 0) { - result += " "; - } - result += sub; - } - } - if (min_times > 0 && min_times < max_times) { - result += " "; - } - if (max_times == std::numeric_limits::max()) { - result += sub + "*"; - } else { - for (int j = min_times; j < max_times; j++) { - if (j > min_times) { - result += " "; - } - result += sub + "?"; - } - } - seq.back().first = result; - seq.back().second = false; + sub = sub_id; } + seq.back().first = build_repetition( + sub_is_literal ? "\"" + sub + "\"" : sub, + min_times, + max_times, + "", + sub_is_literal + ); + seq.back().second = false; } else { std::string literal; auto is_non_literal = [&](char c) { @@ -413,7 +453,7 @@ private: std::string prop_rule_name = visit(prop_schema, name + (name.empty() ? "" : "-") + prop_name); prop_kv_rule_names[prop_name] = _add_rule( name + (name.empty() ? "" : "-") + prop_name + "-kv", - format_literal(prop_name) + " space \":\" space " + prop_rule_name + format_literal(json(prop_name).dump()) + " space \":\" space " + prop_rule_name ); if (required.find(prop_name) != required.end()) { required_props.push_back(prop_name); @@ -424,7 +464,7 @@ private: if (additional_properties.is_object() || (additional_properties.is_boolean() && additional_properties.get())) { std::string sub_name = name + (name.empty() ? "" : "-") + "additional"; std::string value_rule = visit(additional_properties.is_object() ? additional_properties : json::object(), sub_name + "-value"); - std::string kv_rule = _add_rule(sub_name + "-kv", _add_rule("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); + std::string kv_rule = _add_rule(sub_name + "-kv", _add_primitive("string", PRIMITIVE_RULES.at("string")) + " \":\" space " + value_rule); prop_kv_rule_names["*"] = kv_rule; optional_props.push_back("*"); } @@ -486,6 +526,25 @@ private: return rule; } + std::string _add_primitive(const std::string & name, const BuiltinRule & rule) { + auto n = _add_rule(name, rule.content); + for (const auto & dep : rule.deps) { + BuiltinRule dep_rule; + auto it = PRIMITIVE_RULES.find(dep); + if (it == PRIMITIVE_RULES.end()) { + it = STRING_FORMAT_RULES.find(dep); + if (it == STRING_FORMAT_RULES.end()) { + _errors.push_back("Rule " + dep + " not known"); + continue; + } + } + if (_rules.find(dep) == _rules.end()) { + _add_primitive(dep, it->second); + } + } + return n; + } + public: SchemaConverter( const std::function & fetch_json, @@ -495,7 +554,7 @@ public: _rules["space"] = SPACE_RULE; } - void resolve_refs(nlohmann::json & schema, const std::string & url) { + void resolve_refs(json & schema, const std::string & url) { /* * Resolves all $ref fields in the given schema, fetching any remote schemas, * replacing each $ref with absolute reference URL and populates _refs with the @@ -557,11 +616,7 @@ public: } std::string _generate_constant_rule(const json & value) { - if (!value.is_string()) { - _errors.push_back("Only std::string constants are supported, got " + value.dump()); - return ""; - } - return format_literal(value.get()); + return format_literal(value.dump()); } std::string visit(const json & schema, const std::string & name) { @@ -651,49 +706,33 @@ public: return _add_rule(rule_name, rule); } else { std::string item_rule_name = visit(items, name + (name.empty() ? "" : "-") + "item"); - std::string list_item_operator = "( \",\" space " + item_rule_name + " )"; - std::string successive_items; int min_items = schema.contains("minItems") ? schema["minItems"].get() : 0; json max_items_json = schema.contains("maxItems") ? schema["maxItems"] : json(); - int max_items = max_items_json.is_number_integer() ? max_items_json.get() : -1; - if (min_items > 0) { - successive_items += repeat(list_item_operator, min_items - 1); - min_items--; - } - if (max_items >= 0 && max_items > min_items) { - successive_items += repeat(list_item_operator + "?", max_items - min_items - 1); - } else { - successive_items += list_item_operator + "*"; - } - std::string rule; - if (min_items == 0) { - rule = "\"[\" space ( " + item_rule_name + " " + successive_items + " )? \"]\" space"; - } else { - rule = "\"[\" space " + item_rule_name + " " + successive_items + " \"]\" space"; - } - return _add_rule(rule_name, rule); + int max_items = max_items_json.is_number_integer() ? max_items_json.get() : std::numeric_limits::max(); + + return _add_rule(rule_name, "\"[\" space " + build_repetition(item_rule_name, min_items, max_items, "\",\" space") + " \"]\" space"); } } else if ((schema_type.is_null() || schema_type == "string") && schema.contains("pattern")) { return _visit_pattern(schema["pattern"], rule_name); } else if ((schema_type.is_null() || schema_type == "string") && std::regex_match(schema_format, std::regex("^uuid[1-5]?$"))) { - return _add_rule(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid")); - } else if ((schema_type.is_null() || schema_type == "string") && DATE_RULES.find(schema_format) != DATE_RULES.end()) { - for (const auto & kv : DATE_RULES) { - _add_rule(kv.first, kv.second); - } - return schema_format + "-string"; + return _add_primitive(rule_name == "root" ? "root" : schema_format, PRIMITIVE_RULES.at("uuid")); + } else if ((schema_type.is_null() || schema_type == "string") && STRING_FORMAT_RULES.find(schema_format + "-string") != STRING_FORMAT_RULES.end()) { + auto prim_name = schema_format + "-string"; + return _add_rule(rule_name, _add_primitive(prim_name, STRING_FORMAT_RULES.at(prim_name))); + } else if (schema_type == "string" && (schema.contains("minLength") || schema.contains("maxLength"))) { + std::string char_rule = _add_primitive("char", PRIMITIVE_RULES.at("char")); + int min_len = schema.contains("minLength") ? schema["minLength"].get() : 0; + int max_len = schema.contains("maxLength") ? schema["maxLength"].get() : std::numeric_limits::max(); + return _add_rule(rule_name, "\"\\\"\" " + build_repetition(char_rule, min_len, max_len) + " \"\\\"\" space"); } else if (schema.empty() || schema_type == "object") { - for (const auto & n : OBJECT_RULE_NAMES) { - _add_rule(n, PRIMITIVE_RULES.at(n)); - } - return _add_rule(rule_name, "object"); + return _add_rule(rule_name, _add_primitive("object", PRIMITIVE_RULES.at("object"))); } else { if (!schema_type.is_string() || PRIMITIVE_RULES.find(schema_type.get()) == PRIMITIVE_RULES.end()) { _errors.push_back("Unrecognized schema: " + schema.dump()); return ""; } // TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return _add_rule(rule_name == "root" ? "root" : schema_type.get(), PRIMITIVE_RULES.at(schema_type.get())); + return _add_primitive(rule_name == "root" ? "root" : schema_type.get(), PRIMITIVE_RULES.at(schema_type.get())); } } diff --git a/common/json-schema-to-grammar.h b/common/json-schema-to-grammar.h index 2e4c6ade3..41623b346 100644 --- a/common/json-schema-to-grammar.h +++ b/common/json-schema-to-grammar.h @@ -1,4 +1,8 @@ #pragma once + +#include "ggml.h" +// Change JSON_ASSERT from assert() to GGML_ASSERT: +#define JSON_ASSERT GGML_ASSERT #include "json.hpp" -std::string json_schema_to_grammar(const nlohmann::json& schema); +std::string json_schema_to_grammar(const nlohmann::ordered_json& schema); diff --git a/common/log.h b/common/log.h index eb111e784..6934c57b2 100644 --- a/common/log.h +++ b/common/log.h @@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG() INSTEAD // -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) #define LOG_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ @@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std:: // INTERNAL, DO NOT USE // USE LOG_TEE() INSTEAD // -#ifndef _MSC_VER +#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__) #define LOG_TEE_IMPL(str, ...) \ do { \ if (LOG_TARGET != nullptr) \ @@ -566,6 +566,7 @@ inline void log_print_usage() printf(" --log-new Create a separate new log file on start. " "Each log file will have unique name: \"..log\"\n"); printf(" --log-append Don't truncate the old log file.\n"); + printf("\n"); } #define log_dump_cmdline(argc, argv) log_dump_cmdline_impl(argc, argv) diff --git a/common/ngram-cache.cpp b/common/ngram-cache.cpp new file mode 100644 index 000000000..3ca112ef1 --- /dev/null +++ b/common/ngram-cache.cpp @@ -0,0 +1,282 @@ +#include "ngram-cache.h" +#include "common.h" +#include "log.h" + +#include +#include + +void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, + std::vector & inp, int nnew, bool print_progress) { + const int64_t t_start_ms = ggml_time_ms(); + const int64_t inp_size = inp.size(); + + const int64_t n_todo = inp_size * (ngram_max - ngram_min + 1); + int64_t n_done = 0; + + for (int64_t ngram_size = ngram_min; ngram_size <= ngram_max; ++ngram_size) { + const int64_t i_start = std::max(inp_size - nnew, ngram_size); + for (int64_t i = i_start; i < inp_size; ++i) { + const int64_t ngram_start = i - ngram_size; + llama_ngram ngram(&inp[ngram_start], ngram_size); + const llama_token token = inp[i]; + + llama_ngram_cache::iterator part_it = ngram_cache.find(ngram); + if (part_it == ngram_cache.end()) { + llama_ngram_cache_part part; + part.emplace(token, 1); + ngram_cache.emplace(ngram, part); + } else { + llama_ngram_cache_part::iterator token_count_it = part_it->second.find(token); + if (token_count_it == part_it->second.end()) { + part_it->second.emplace(token, 1); + } else { + token_count_it->second++; + } + } + ++n_done; + + if (print_progress && n_done % 10000000 == 0) { + const int64_t t_now_ms = ggml_time_ms(); + const int64_t eta_ms = (inp_size*(ngram_max-ngram_min+1) - n_done) * (t_now_ms - t_start_ms) / n_done; + const int64_t eta_min = eta_ms / (60*1000); + const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; + + fprintf(stderr, "%s: %" PRId64 "/%" PRId64 " done, ETA: %02" PRId64 ":%02" PRId64 "\n", __func__, n_done, n_todo, eta_min, eta_s); + } + } + } +} + +// Helper function to get a token from the combined, speculative sequence of inp and draft. +static llama_token get_token(const std::vector & inp, const std::vector & draft, const size_t i) { + return i < inp.size() ? inp[i] : draft[1 + i - inp.size()]; +} + +// If sample size or percentage are below these thresholds the draft is aborted early: +constexpr int draft_min_sample_size_lax[LLAMA_NGRAM_MAX] = { 2, 2, 1, 1}; +constexpr int draft_min_percent_lax[LLAMA_NGRAM_MAX] = {66, 50, 50, 50}; +constexpr int draft_min_sample_size_strict[LLAMA_NGRAM_MAX] = { 4, 3, 2, 2}; +constexpr int draft_min_percent_strict[LLAMA_NGRAM_MAX] = {75, 66, 66, 66}; + +// Helper function that tries to draft a token from only the static ngram cache: +static llama_token try_draft(llama_ngram_cache & nc_static, const llama_ngram ngram_static) { + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + if (part_static_it == nc_static.end()) { + return -1; + } + const llama_ngram_cache_part part_static = part_static_it->second; + + int max_count_static = 0; + int sum_count_static = 0; + llama_token max_token = -1; + + for (std::pair token_count_static : part_static) { + const llama_token token = token_count_static.first; + const int32_t count_static = token_count_static.second; + + if (count_static > max_count_static) { + max_token = token; + max_count_static = count_static; + } + sum_count_static += count_static; + } + + if (sum_count_static < draft_min_sample_size_lax[LLAMA_NGRAM_STATIC-1]) { + return -1; + } + if (100*max_count_static < draft_min_percent_lax[LLAMA_NGRAM_STATIC-1]*sum_count_static) { + return -1; + } + return max_token; +} + +// Try to draft a token from primary cache (context/dynamic), validate with static cache: +static llama_token try_draft( + llama_ngram_cache & nc_primary, const std::vector & ngrams_primary, llama_ngram_cache_part & part_static, + const int * min_sample_size, const int * min_percent) { + + llama_token drafted_token = -1; + + for (int i = ngrams_primary.size()-1; i >= 0 && drafted_token == -1; --i) { + const llama_ngram ngram_primary = ngrams_primary[i]; + + llama_ngram_cache::iterator part_primary_it = nc_primary.find(ngram_primary); + if (part_primary_it == nc_primary.end()) { + continue; + } + const llama_ngram_cache_part part_primary = part_primary_it->second; + + int max_count_primary = 0; + int max_count_static = 0; + int sum_count_primary = 0; + llama_token max_token = -1; + + for (std::pair token_count_primary : part_primary) { + const llama_token token = token_count_primary.first; + + llama_ngram_cache_part::iterator token_count_static_it = part_static.find(token); + + const int32_t count_primary = token_count_primary.second; + const int32_t count_static = token_count_static_it != part_static.end() ? 100*token_count_static_it->second : 1; + + if (count_primary*count_static > max_count_primary*max_count_static) { + max_token = token; + max_count_primary = count_primary; + max_count_static = count_static; + } + sum_count_primary += count_primary; + } + + if (sum_count_primary < min_sample_size[i]) { + continue; + } + if (100*max_count_primary < min_percent[i]*sum_count_primary) { + continue;; + } + drafted_token = max_token; + } + + return drafted_token; +} + +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static +) { + GGML_ASSERT(draft.size() == 1); + const int inp_size = inp.size(); + + if (inp_size < LLAMA_NGRAM_STATIC) { + return; + } + + while ((int) draft.size()-1 < n_draft) { + llama_token drafted_token = -1; + + const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1; + llama_ngram ngram_static; + for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) { + ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j); + } + llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static); + llama_ngram_cache_part part_static; + if (part_static_it != nc_static.end()) { + part_static = part_static_it->second; + } + + // cd = context + dynamic + std::vector ngrams_cd; + for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) { + const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1; + llama_ngram ngram_cd; + for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) { + ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j); + } + ngrams_cd.push_back(ngram_cd); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_context, ngrams_cd, part_static, draft_min_sample_size_lax, draft_min_percent_lax); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_dynamic, ngrams_cd, part_static, draft_min_sample_size_strict, draft_min_percent_strict); + } + if (drafted_token == -1) { + drafted_token = try_draft(nc_static, ngram_static); + } + + if (drafted_token == -1) { + break; + } + + LOG(" - draft candidate: token=%d\n", drafted_token); + draft.push_back(drafted_token); + } +} + +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename) { + std::ofstream file_out(filename, std::ios::binary); + for (std::pair item : ngram_cache) { + const llama_ngram ngram = item.first; + llama_ngram_cache_part token_counts = item.second; + GGML_ASSERT(!token_counts.empty()); + const int32_t ntokens = token_counts.size(); + GGML_ASSERT(ntokens > 0); + + file_out.write(reinterpret_cast(&ngram), sizeof(llama_ngram)); + file_out.write(reinterpret_cast(&ntokens), sizeof(int32_t)); + for (std::pair item2 : token_counts) { + const llama_token token = item2.first; + const int32_t count = item2.second; + GGML_ASSERT(count > 0); + + file_out.write(reinterpret_cast(&token), sizeof(llama_token)); + file_out.write(reinterpret_cast(&count), sizeof(int32_t)); + } + } + +} + +llama_ngram_cache llama_ngram_cache_load(std::string & filename) { + std::ifstream hashmap_file(filename, std::ios::binary); + if (!hashmap_file) { + throw std::ifstream::failure("Unable to open file " + filename); + } + llama_ngram_cache ngram_cache; + + llama_ngram ngram; + int32_t ntokens; + llama_token token; + int32_t count; + + char * ngramc = reinterpret_cast(&ngram); + char * ntokensc = reinterpret_cast(&ntokens); + char * tokenc = reinterpret_cast(&token); + char * countc = reinterpret_cast(&count); + while(hashmap_file.read(ngramc, sizeof(llama_ngram))) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t))); + GGML_ASSERT(ntokens > 0); + llama_ngram_cache_part token_counts; + + for (int i = 0; i < ntokens; ++i) { + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token))); + GGML_ASSERT(!hashmap_file.eof()); + GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t))); + GGML_ASSERT(count > 0); + token_counts.emplace(token, count); + } + + ngram_cache.emplace(ngram, token_counts); + } + GGML_ASSERT(hashmap_file.eof()); + + return ngram_cache; +} + +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add) { + for (std::pair ngram_part : ngram_cache_add) { + const llama_ngram ngram = ngram_part.first; + llama_ngram_cache_part part = ngram_part.second; + + llama_ngram_cache::iterator part_merged_it = ngram_cache_target.find(ngram); + if (part_merged_it == ngram_cache_target.end()) { + ngram_cache_target.emplace(ngram, part); + continue; + } + + for (std::pair token_count : part) { + const llama_token token = token_count.first; + const int32_t count = token_count.second; + GGML_ASSERT(count > 0); + + llama_ngram_cache_part::iterator token_count_merged_it = part_merged_it->second.find(token); + if (token_count_merged_it == part_merged_it->second.end()) { + part_merged_it->second.emplace(token, count); + continue; + } + + token_count_merged_it->second += count; + } + } +} diff --git a/common/ngram-cache.h b/common/ngram-cache.h new file mode 100644 index 000000000..e4fa4cbd1 --- /dev/null +++ b/common/ngram-cache.h @@ -0,0 +1,94 @@ +#pragma once + +#include "llama.h" + +#include +#include +#include + +#define LLAMA_NGRAM_MIN 1 +#define LLAMA_NGRAM_MAX 4 +#define LLAMA_NGRAM_STATIC 2 + +// Data structures to map n-grams to empirical token probabilities: + +struct llama_ngram { + llama_token tokens[LLAMA_NGRAM_MAX]; + + llama_ngram() { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = -1; + } + } + + llama_ngram(const llama_token * input, const int ngram_size) { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + tokens[i] = i < ngram_size ? input[i] : -1; + } + } + + bool operator==(const llama_ngram & other) const { + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + if (tokens[i] != other.tokens[i]) { + return false; + } + } + return true; + } +}; + +struct llama_ngram_hash_function { + size_t operator()(const llama_ngram & ngram) const { + size_t hash = 0; + for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) { + hash ^= std::hash{}(ngram.tokens[i]); + } + return hash; + } +}; + +// token -> number of times token has been seen +typedef std::unordered_map llama_ngram_cache_part; + +// n-gram -> empirical distribution of following tokens +typedef std::unordered_map llama_ngram_cache; + + +// Update an ngram cache with tokens. +// ngram_cache: the cache to modify. +// ngram_min/ngram_max: the min/max size of the ngrams to extract from inp_data. +// inp_data: the token sequence with which to update ngram_cache. +// nnew: how many new tokens have been appended to inp_data since the last call to this function. +// print_progress: whether to print progress to stderr. +// +// In order to get correct results inp_data can ONLY BE APPENDED TO. +// Changes in the middle need a complete rebuild. +void llama_ngram_cache_update( + llama_ngram_cache & ngram_cache, int ngram_min, int ngram_max, std::vector & inp_data, int nnew, bool print_progress); + +// Try to draft tokens from ngram caches. +// inp: the tokens generated so far. +// draft: the token sequence to draft. Expected to initially contain the previously sampled token. +// n_draft: maximum number of tokens to add to draft. +// ngram_min/gram_max: the min/max size of the ngrams in nc_context and nc_dynamic. +// nc_context: ngram cache based on current context. +// nc_dynamic: ngram cache based on previous user generations. +// nc_static: ngram cache generated from a large text corpus, used for validation. +void llama_ngram_cache_draft( + std::vector & inp, std::vector & draft, int n_draft, int ngram_min, int ngram_max, + llama_ngram_cache & nc_context, llama_ngram_cache & nc_dynamic, llama_ngram_cache & nc_static); + +// Save an ngram cache to a file. +// ngram_cache: the ngram cache to save. +// filename: the path under which to save the ngram cache. +void llama_ngram_cache_save(llama_ngram_cache & ngram_cache, std::string & filename); + +// Load an ngram cache saved with llama_ngram_cache_save. +// filename: the path from which to load the ngram cache. +// returns: an ngram cache containing the information saved to filename. +llama_ngram_cache llama_ngram_cache_load(std::string & filename); + +// Merge two ngram caches. +// ngram_cache_target: the ngram cache to which to add the information from ngram_cache_add. +// ngram_cache_add: the ngram cache to add to ngram_cache_target. +void llama_ngram_cache_merge(llama_ngram_cache & ngram_cache_target, llama_ngram_cache & ngram_cache_add); diff --git a/common/sampling.cpp b/common/sampling.cpp index 5a5450982..3715a7985 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -1,4 +1,6 @@ +#define LLAMA_API_INTERNAL #include "sampling.h" +#include struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) { struct llama_sampling_context * result = new llama_sampling_context(); @@ -33,6 +35,10 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_ result->prev.resize(params.n_prev); + result->n_considered = 0; + + llama_sampling_set_rng_seed(result, params.seed); + return result; } @@ -60,6 +66,14 @@ void llama_sampling_reset(llama_sampling_context * ctx) { std::fill(ctx->prev.begin(), ctx->prev.end(), 0); ctx->cur.clear(); + ctx->n_considered = 0; +} + +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) { + if (seed == LLAMA_DEFAULT_SEED) { + seed = std::random_device{}(); + } + ctx->rng.seed(seed); } void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst) { @@ -168,77 +182,20 @@ static llama_token llama_sampling_sample_impl( bool is_resampling) { // Add a parameter to indicate if we are resampling const llama_sampling_params & params = ctx_sampling->params; - const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); - const float temp = params.temp; - const int32_t penalty_last_n = params.penalty_last_n < 0 ? params.n_prev : params.penalty_last_n; - const float penalty_repeat = params.penalty_repeat; - const float penalty_freq = params.penalty_freq; - const float penalty_present = params.penalty_present; const int mirostat = params.mirostat; const float mirostat_tau = params.mirostat_tau; const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - - auto & prev = ctx_sampling->prev; - auto & cur = ctx_sampling->cur; + std::vector original_logits; + auto cur_p = llama_sampling_prepare(ctx_sampling, ctx_main, ctx_cfg, idx, !is_resampling, &original_logits); + if (!is_resampling) { + GGML_ASSERT(!original_logits.empty()); + } llama_token id = 0; - // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Declare original_logits at the beginning of the function scope - std::vector original_logits; - - if (!is_resampling) { - // Only make a copy of the original logits if we are not in the resampling phase, not sure if I actually have to do this. - original_logits = std::vector(logits, logits + llama_n_vocab(llama_get_model(ctx_main))); - } - - // apply params.logit_bias map - for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { - logits[it->first] += it->second; - } - - if (ctx_cfg) { - float * logits_guidance = llama_get_logits_ith(ctx_cfg, idx); - llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale); - } - - cur.clear(); - - for (llama_token token_id = 0; token_id < n_vocab; token_id++) { - cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array cur_p = { cur.data(), cur.size(), false }; - - // apply penalties - const auto& penalty_tokens = params.use_penalty_prompt_tokens ? params.penalty_prompt_tokens : prev; - const int penalty_tokens_used_size = std::min((int)penalty_tokens.size(), penalty_last_n); - if (penalty_tokens_used_size) { - const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; - - llama_sample_repetition_penalties(ctx_main, &cur_p, - penalty_tokens.data() + penalty_tokens.size() - penalty_tokens_used_size, - penalty_tokens_used_size, penalty_repeat, penalty_freq, penalty_present); - - if (!penalize_nl) { - for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { - cur_p.data[idx].logit = nl_logit; - break; - } - } - } - } - - // If we are in the resampling phase, apply grammar checks before sampling logic - if (is_resampling && ctx_sampling->grammar != NULL) { - llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); - } - if (temp < 0.0) { // greedy sampling, with probs llama_sample_softmax(ctx_main, &cur_p); @@ -260,7 +217,7 @@ static llama_token llama_sampling_sample_impl( sampler_queue(ctx_main, params, cur_p, min_keep); - id = llama_sample_token(ctx_main, &cur_p); + id = llama_sample_token_with_rng(ctx_main, &cur_p, ctx_sampling->rng); //{ // const int n_top = 10; @@ -299,14 +256,18 @@ static llama_token llama_sampling_sample_impl( } } + ctx_sampling->n_considered = cur_p.size; + return id; } -static llama_token_data_array llama_sample_probability_distribution_impl( +static llama_token_data_array llama_sampling_prepare_impl( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - const int idx) { + const int idx, + bool apply_grammar, + std::vector * original_logits) { const llama_sampling_params & params = ctx_sampling->params; const int n_vocab = llama_n_vocab(llama_get_model(ctx_main)); @@ -315,6 +276,7 @@ static llama_token_data_array llama_sample_probability_distribution_impl( const float penalty_repeat = params.penalty_repeat; const float penalty_freq = params.penalty_freq; const float penalty_present = params.penalty_present; + const bool penalize_nl = params.penalize_nl; auto & prev = ctx_sampling->prev; @@ -323,8 +285,10 @@ static llama_token_data_array llama_sample_probability_distribution_impl( // Get a pointer to the logits float * logits = llama_get_logits_ith(ctx_main, idx); - // Declare original_logits at the beginning of the function scope - std::vector original_logits; + if (apply_grammar && original_logits != NULL) { + // Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this. + *original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))}; + } // apply params.logit_bias map for (auto it = params.logit_bias.begin(); it != params.logit_bias.end(); it++) { @@ -364,12 +328,11 @@ static llama_token_data_array llama_sample_probability_distribution_impl( } } - // apply grammar checks - if (ctx_sampling->grammar != NULL) { + // apply grammar checks before sampling logic + if (apply_grammar && ctx_sampling->grammar != NULL) { llama_sample_grammar(ctx_main, &cur_p, ctx_sampling->grammar); } - llama_sample_softmax(ctx_main, &cur_p); return cur_p; } @@ -382,12 +345,14 @@ llama_token llama_sampling_sample( return llama_sampling_sample_impl(ctx_sampling, ctx_main, ctx_cfg, idx, false); } -llama_token_data_array llama_sampling_probability_distribution( +llama_token_data_array llama_sampling_prepare( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - const int idx) { - return llama_sample_probability_distribution_impl(ctx_sampling,ctx_main, ctx_cfg, idx); + const int idx, + bool apply_grammar, + std::vector * original_logits) { + return llama_sampling_prepare_impl(ctx_sampling,ctx_main, ctx_cfg, idx, apply_grammar, original_logits); } void llama_sampling_accept( diff --git a/common/sampling.h b/common/sampling.h index 79a998be8..5b73ecdcd 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -4,9 +4,10 @@ #include "grammar-parser.h" +#include #include -#include #include +#include // sampler types enum class llama_sampler_type : char { @@ -20,25 +21,26 @@ enum class llama_sampler_type : char { // sampling parameters typedef struct llama_sampling_params { - int32_t n_prev = 64; // number of previous tokens to remember - int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. - int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens - int32_t top_k = 40; // <= 0 to use vocab size - float top_p = 0.95f; // 1.0 = disabled - float min_p = 0.05f; // 0.0 = disabled - float tfs_z = 1.00f; // 1.0 = disabled - float typical_p = 1.00f; // 1.0 = disabled - float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities - float dynatemp_range = 0.00f; // 0.0 = disabled - float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler - int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.00f; // 1.0 = disabled - float penalty_freq = 0.00f; // 0.0 = disabled - float penalty_present = 0.00f; // 0.0 = disabled - int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 - float mirostat_tau = 5.00f; // target entropy - float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = false; // consider newlines as a repeatable token + int32_t n_prev = 64; // number of previous tokens to remember + int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens. + int32_t min_keep = 0; // 0 = disabled, otherwise samplers should return at least min_keep tokens + int32_t top_k = 40; // <= 0 to use vocab size + float top_p = 0.95f; // 1.0 = disabled + float min_p = 0.05f; // 0.0 = disabled + float tfs_z = 1.00f; // 1.0 = disabled + float typical_p = 1.00f; // 1.0 = disabled + float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities + float dynatemp_range = 0.00f; // 0.0 = disabled + float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler + int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) + float penalty_repeat = 1.00f; // 1.0 = disabled + float penalty_freq = 0.00f; // 0.0 = disabled + float penalty_present = 0.00f; // 0.0 = disabled + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + float mirostat_tau = 5.00f; // target entropy + float mirostat_eta = 0.10f; // learning rate + bool penalize_nl = false; // consider newlines as a repeatable token + uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampling_context std::vector samplers_sequence = { llama_sampler_type::TOP_K, @@ -79,6 +81,9 @@ struct llama_sampling_context { // TODO: replace with ring-buffer std::vector prev; std::vector cur; + size_t n_considered; + + std::mt19937 rng; }; #include "common.h" @@ -93,6 +98,9 @@ void llama_sampling_free(struct llama_sampling_context * ctx); // - reset grammar void llama_sampling_reset(llama_sampling_context * ctx); +// Set the sampler seed +void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed); + // Copy the sampler context void llama_sampling_cp(llama_sampling_context * src, llama_sampling_context * dst); @@ -129,14 +137,16 @@ llama_token llama_sampling_sample( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - int idx = 0); + int idx = -1); -// returns the probability that token of given id will be sampled -llama_token_data_array llama_sampling_probability_distribution( +// Prepares and adjusts the set of token candidates for sampling based on penalties, biases, and sampling parameters. +llama_token_data_array llama_sampling_prepare( struct llama_sampling_context * ctx_sampling, struct llama_context * ctx_main, struct llama_context * ctx_cfg, - int idx = 0); + int idx = 0, + bool apply_grammar = true, + std::vector * original_logits = nullptr); void llama_sampling_accept( struct llama_sampling_context * ctx_sampling, diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py new file mode 100755 index 000000000..e64687722 --- /dev/null +++ b/convert-hf-to-gguf-update.py @@ -0,0 +1,311 @@ +#!/usr/bin/env python3 + +# This script downloads the tokenizer models of the specified models from Huggingface and +# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py +# +# This is necessary in order to analyze the type of pre-tokenizer used by the model and +# provide the necessary information to llama.cpp via the GGUF header in order to implement +# the same pre-tokenizer. +# +# ref: https://github.com/ggerganov/llama.cpp/pull/6920 +# +# Instructions: +# +# - Add a new model to the "models" list +# - Run the script with your huggingface token: +# +# python3 convert-hf-to-gguf-update.py +# +# - Copy-paste the generated get_vocab_base_pre() function into convert-hf-to-gguf.py +# - Update llama.cpp with the new pre-tokenizer if necessary +# +# TODO: generate tokenizer tests for llama.cpp +# TODO: automate the update of convert-hf-to-gguf.py +# + +import logging +import os +import requests +import sys +import json + +from hashlib import sha256 +from enum import IntEnum, auto +from transformers import AutoTokenizer + +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger("convert-hf-to-gguf-update") + + +class TOKENIZER_TYPE(IntEnum): + SPM = auto() + BPE = auto() + WPM = auto() + + +# TODO: this string has to exercise as much pre-tokenizer functionality as possible +# will be updated with time - contributions welcome +chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + +if len(sys.argv) == 2: + token = sys.argv[1] + if not token.startswith("hf_"): + logger.info("Huggingface token seems invalid") + logger.info("Usage: python convert-hf-to-gguf-update.py ") + sys.exit(1) +else: + logger.info("Usage: python convert-hf-to-gguf-update.py ") + sys.exit(1) + +# TODO: add models here, base models preferred +models = [ + {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", }, + {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", }, + {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", }, + {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", }, + {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", }, + {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", }, + {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", }, + {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", }, + {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", }, + {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", }, + {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", }, + {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", }, + {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", }, + {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", }, + {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", }, +] + +# make directory "models/tokenizers" if it doesn't exist +if not os.path.exists("models/tokenizers"): + os.makedirs("models/tokenizers") + + +def download_file_with_auth(url, token, save_path): + headers = {"Authorization": f"Bearer {token}"} + response = requests.get(url, headers=headers) + if response.status_code == 200: + with open(save_path, 'wb') as f: + f.write(response.content) + logger.info(f"File {save_path} downloaded successfully") + else: + logger.info(f"Failed to download file. Status code: {response.status_code}") + + +# download the tokenizer models +for model in models: + name = model["name"] + repo = model["repo"] + tokt = model["tokt"] + + if not os.path.exists(f"models/tokenizers/{name}"): + os.makedirs(f"models/tokenizers/{name}") + else: + logger.info(f"Directory models/tokenizers/{name} already exists - skipping") + continue + + logger.info(f"Downloading {name} to models/tokenizers/{name}") + + url = f"{repo}/raw/main/config.json" + save_path = f"models/tokenizers/{name}/config.json" + download_file_with_auth(url, token, save_path) + + url = f"{repo}/raw/main/tokenizer.json" + save_path = f"models/tokenizers/{name}/tokenizer.json" + download_file_with_auth(url, token, save_path) + + # if downloaded file is less than 1KB, we likely need to download an LFS instead + if os.path.getsize(save_path) < 1024: + # remove the file + os.remove(save_path) + url = f"{repo}/resolve/main/tokenizer.json" + save_path = f"models/tokenizers/{name}/tokenizer.json" + download_file_with_auth(url, token, save_path) + + if tokt == TOKENIZER_TYPE.SPM: + url = f"{repo}/resolve/main/tokenizer.model" + save_path = f"models/tokenizers/{name}/tokenizer.model" + download_file_with_auth(url, token, save_path) + + url = f"{repo}/raw/main/tokenizer_config.json" + save_path = f"models/tokenizers/{name}/tokenizer_config.json" + download_file_with_auth(url, token, save_path) + +# generate the source code for the convert-hf-to-gguf.py:get_vocab_base_pre() function: +# TODO: auto-update convert-hf-to-gguf.py with the generated function + +src_ifs = "" +for model in models: + name = model["name"] + tokt = model["tokt"] + + if tokt == TOKENIZER_TYPE.SPM: + continue + + # create the tokenizer + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.info(f"model: {name}") + logger.info(f"tokt: {tokt}") + logger.info(f"repo: {model['repo']}") + logger.info(f"chktok: {chktok}") + logger.info(f"chkhsh: {chkhsh}") + + # print the "pre_tokenizer" content from the tokenizer.json + with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f: + cfg = json.load(f) + normalizer = cfg["normalizer"] + logger.info("normalizer: " + json.dumps(normalizer, indent=4)) + pre_tokenizer = cfg["pre_tokenizer"] + logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4)) + + logger.info("") + + src_ifs += f" if chkhsh == \"{chkhsh}\":\n" + src_ifs += f" # ref: {model['repo']}\n" + src_ifs += f" res = \"{name}\"\n" + +src_func = f""" + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = {repr(chktxt)} + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {{chktok}}") + logger.debug(f"chkhsh: {{chkhsh}}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! +{src_ifs} + if res is None: + logger.warning("\\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {{chkhsh}}") + logger.warning("**************************************************************************************") + logger.warning("\\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}") + logger.debug(f"chkhsh: {{chkhsh}}") + + return res +""" + +print(src_func) # noqa: NP100 + +logger.info("\n") +logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!") +logger.info("\n") + +# generate tests for each tokenizer model + +tests = [ + "ied 4 ½ months", + "Führer", + "", + " ", + " ", + " ", + "\t", + "\n", + "\n\n", + "\n\n\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + " (", + "\n =", + "' era", + "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "3", + "33", + "333", + "3333", + "33333", + "333333", + "3333333", + "33333333", + "333333333", + # "Cửa Việt", # llama-bpe fails on this + chktxt, +] + +# write the tests to ./models/ggml-vocab-{name}.gguf.inp +# the format is: +# +# test0 +# __ggml_vocab_test__ +# test1 +# __ggml_vocab_test__ +# ... +# + +# with each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out +# for each test, write the resulting tokens on a separate line + +for model in models: + name = model["name"] + tokt = model["tokt"] + + # create the tokenizer + tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}") + + with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f: + for text in tests: + f.write(f"{text}") + f.write("\n__ggml_vocab_test__\n") + + with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f: + for text in tests: + res = tokenizer.encode(text, add_special_tokens=False) + for r in res: + f.write(f" {r}") + f.write("\n") + + logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*") + +# generate commands for creating vocab files + +logger.info("\nRun the following commands to generate the vocab files for testing:\n") + +for model in models: + name = model["name"] + + print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100 + +logger.info("\n") diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 1e49d56c1..1dc18b2a5 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -2,16 +2,17 @@ from __future__ import annotations +import logging import argparse import contextlib import json import os import re import sys -from abc import ABC, abstractmethod from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast +from hashlib import sha256 +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast, overload import numpy as np import torch @@ -23,7 +24,9 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf -from convert import HfVocab +from convert import LlamaHfVocab + +logger = logging.getLogger("hf-to-gguf") ###### MODEL DEFINITIONS ###### @@ -40,28 +43,55 @@ class SentencePieceTokenTypes(IntEnum): AnyModel = TypeVar("AnyModel", bound="type[Model]") -class Model(ABC): +class Model: _model_classes: dict[str, type[Model]] = {} - def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): + dir_model: Path + ftype: int + fname_out: Path + is_big_endian: bool + endianess: gguf.GGUFEndian + use_temp_file: bool + lazy: bool + part_names: list[str] + is_safetensors: bool + hparams: dict[str, Any] + gguf_writer: gguf.GGUFWriter + block_count: int + tensor_map: gguf.TensorNameMap + tensor_names: set[str] | None + + # subclasses should define this! + model_arch: gguf.MODEL_ARCH + + def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool): + if self.__class__ == Model: + raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated") self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out self.is_big_endian = is_big_endian self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.is_safetensors = self._is_model_safetensors() - self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") - self.part_names = self._get_part_names() + self.use_temp_file = use_temp_file + self.lazy = not eager + self.part_names = Model.get_model_part_names(self.dir_model, ".safetensors") + self.is_safetensors = len(self.part_names) > 0 + if not self.is_safetensors: + self.part_names = Model.get_model_part_names(self.dir_model, ".bin") self.hparams = Model.load_hparams(self.dir_model) - self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) + self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + self.tensor_names = None - @property - @abstractmethod - def model_arch(self) -> gguf.MODEL_ARCH: - pass + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: + def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) if key is not None: return self.hparams[key] @@ -73,8 +103,24 @@ class Model(ABC): self._set_vocab_gpt2() def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + tensor_names_from_parts: set[str] = set() + + if len(self.part_names) > 1: + self.tensor_names = set() + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(self.dir_model / index_name, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + self.tensor_names.update(weight_map.keys()) + else: + self.tensor_names = tensor_names_from_parts + for part_name in self.part_names: - print(f"gguf: loading model part '{part_name}'") + logger.info(f"gguf: loading model part '{part_name}'") ctx: ContextManager[Any] if self.is_safetensors: from safetensors import safe_open @@ -83,48 +129,97 @@ class Model(ABC): ctx = contextlib.nullcontext(torch.load(str(self.dir_model / part_name), map_location="cpu", mmap=True, weights_only=True)) with ctx as model_part: + tensor_names_from_parts.update(model_part.keys()) + for name in model_part.keys(): data = model_part.get_tensor(name) if self.is_safetensors else model_part[name] + if self.lazy: + data = LazyTorchTensor.from_eager(data) yield name, data + # only verify tensor name presence; it doesn't matter if they are not in the right files + if len(sym_diff := tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: + raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}") + + def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: + name: str = gguf.TENSOR_NAMES[key] + if key not in gguf.MODEL_TENSORS[self.model_arch]: + raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}") + if "{bid}" in name: + assert bid is not None + name = name.format(bid=bid) + return name + suffix + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes) + if new_name is None: + raise ValueError(f"Can not map tensor {name!r}") + return new_name + def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.block_count) if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") n_embd = self.find_hparam(["hidden_size", "n_embd"]) self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") n_head = self.find_hparam(["num_attention_heads", "n_head"]) self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + return [(self.map_tensor_name(name), data_torch)] + + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused + + return False def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,") + for name, data_torch in self.get_tensors(): # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): + if name.endswith((".attention.masked_bias", ".attention.bias", ".rotary_emb.inv_freq")): continue old_dtype = data_torch.dtype @@ -133,38 +228,52 @@ class Model(ABC): if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) - data = data_torch.squeeze().numpy() + # use the first number-like part of the tensor name as the block id + bid = None + for part in name.split("."): + if part.isdecimal(): + bid = int(part) + break - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + for new_name, data in ((n, d.squeeze().numpy()) for n, d in self.modify_tensors(data_torch, name, bid)): + data: np.ndarray = data # type hint + n_dims = len(data.shape) + data_dtype = data.dtype - n_dims = len(data.shape) - data_dtype = data.dtype + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) + # when both are True, f32 should win + extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims) + extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims) - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) + # Most of the codebase that takes in 1D tensors or norms only handles F32 tensors + extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight") - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) + # if f16 desired, convert any float32 2-dim weight tensors to float16 + extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + # when both extra_f32 and extra_f16 are False, convert to float32 by default + if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16): + data = data.astype(np.float32) - self.gguf_writer.add_tensor(new_name, data) + if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32: + data = data.astype(np.float16) + + # reverse shape to make it similar to the internal ggml dimension order + shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}" + + # n_dims is implicit in the shape + logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}") + + self.gguf_writer.add_tensor(new_name, data) def write(self): self.write_tensors() self.gguf_writer.write_header_to_file() self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file() + self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() def write_vocab(self): @@ -173,16 +282,18 @@ class Model(ABC): self.gguf_writer.close() @staticmethod - def count_model_parts(dir_model: Path, prefix: str) -> int: - num_parts = 0 + def get_model_part_names(dir_model: Path, suffix: str) -> list[str]: + part_names: list[str] = [] for filename in os.listdir(dir_model): - if filename.endswith(prefix): - num_parts += 1 + if filename.endswith(suffix): + part_names.append(filename) - return num_parts + part_names.sort() + + return part_names @staticmethod - def load_hparams(dir_model): + def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: return json.load(f) @@ -190,50 +301,37 @@ class Model(ABC): def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names - def func(modelcls: type[Model]): + def func(modelcls: AnyModel) -> AnyModel: for name in names: cls._model_classes[name] = modelcls return modelcls return func @classmethod - def from_model_architecture(cls, arch): + def from_model_architecture(cls, arch: str) -> type[Model]: try: return cls._model_classes[arch] except KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None - def _is_model_safetensors(self) -> bool: - return Model.count_model_parts(self.dir_model, ".safetensors") > 0 - - def _get_part_names(self): - if self.is_safetensors: - if self.num_parts == 1: # there's only one .safetensors file - return ("model.safetensors",) - return (f"model-{n:05}-of-{self.num_parts:05}.safetensors" for n in range(1, self.num_parts + 1)) - - if self.num_parts == 1: # there's only one .bin file - return ("pytorch_model.bin",) - return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - - def _set_vocab_gpt2(self): - dir_model = self.dir_model - hparams = self.hparams - tokens: list[bytearray] = [] + # used for GPT-2 BPE and WordPiece vocabs + def get_vocab_base(self) -> tuple[list[str], list[int], str]: + tokens: list[str] = [] toktypes: list[int] = [] from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained(dir_model) - vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + tokenizer = AutoTokenizer.from_pretrained(self.dir_model) + vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab)) assert max(tokenizer.vocab.values()) < vocab_size + tokpre = self.get_vocab_base_pre(tokenizer) + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} added_vocab = tokenizer.get_added_vocab() for i in range(vocab_size): if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode('utf-8') - tokens.append(bytearray(pad_token)) + tokens.append(f"[PAD{i}]") toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) @@ -245,17 +343,104 @@ class Model(ABC): tokens.append(reverse_vocab[i]) toktypes.append(gguf.TokenType.NORMAL) + return tokens, toktypes, tokpre + + # NOTE: this function is generated by convert-hf-to-gguf-update.py + # do not modify it manually! + # ref: https://github.com/ggerganov/llama.cpp/pull/6920 + def get_vocab_base_pre(self, tokenizer) -> str: + # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that + # is specific for the BPE pre-tokenizer used by the model + # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can + # use in llama.cpp to implement the same pre-tokenizer + + chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````""""......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL' + + chktok = tokenizer.encode(chktxt) + chkhsh = sha256(str(chktok).encode()).hexdigest() + + logger.debug(f"chktok: {chktok}") + logger.debug(f"chkhsh: {chkhsh}") + + res = None + + # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script + # or pull the latest version of the model from Huggingface + # don't edit the hashes manually! + if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": + # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B + res = "llama-bpe" + if chkhsh == "049ecf7629871e3041641907f3de7c733e4dbfdc736f57d882ba0b0845599754": + # ref: https://huggingface.co/deepseek-ai/deepseek-llm-7b-base + res = "deepseek-llm" + if chkhsh == "347715f544604f9118bb75ed199f68779f423cabb20db6de6f31b908d04d7821": + # ref: https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base + res = "deepseek-coder" + if chkhsh == "8aeee3860c56296a157a1fe2fad249ec40aa59b1bb5709f4ade11c4e6fe652ed": + # ref: https://huggingface.co/tiiuae/falcon-7b + res = "falcon" + if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f": + # ref: https://huggingface.co/BAAI/bge-small-en-v1.5 + res = "bert-bge" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/mosaicml/mpt-7b + res = "mpt" + if chkhsh == "35d91631860c815f952d711435f48d356ebac988362536bed955d43bfa436e34": + # ref: https://huggingface.co/bigcode/starcoder2-3b + res = "starcoder" + if chkhsh == "3ce83efda5659b07b1ad37ca97ca5797ea4285d9b9ab0dc679e4a720c9da7454": + # ref: https://huggingface.co/openai-community/gpt2 + res = "gpt-2" + if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff": + # ref: https://huggingface.co/smallcloudai/Refact-1_6-base + res = "refact" + if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8": + # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01 + res = "command-r" + if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea": + # ref: https://huggingface.co/Qwen/Qwen1.5-7B + res = "qwen2" + if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166": + # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf + res = "olmo" + if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e": + # ref: https://huggingface.co/databricks/dbrx-instruct + res = "dbrx" + + if res is None: + logger.warning("\n") + logger.warning("**************************************************************************************") + logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!") + logger.warning("** There are 2 possible reasons for this:") + logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet") + logger.warning("** - the pre-tokenization config has changed upstream") + logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.") + logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920") + logger.warning("**") + logger.warning(f"** chkhsh: {chkhsh}") + logger.warning("**************************************************************************************") + logger.warning("\n") + raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()") + + logger.debug(f"tokenizer.ggml.pre: {repr(res)}") + logger.debug(f"chkhsh: {chkhsh}") + + return res + + def _set_vocab_gpt2(self) -> None: + tokens, toktypes, tokpre = self.get_vocab_base() self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) - special_vocab = gguf.SpecialVocab(dir_model, load_merges=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) special_vocab.add_to_gguf(self.gguf_writer) def _set_vocab_qwen(self): dir_model = self.dir_model hparams = self.hparams - tokens: list[bytearray] = [] + tokens: list[str] = [] toktypes: list[int] = [] from transformers import AutoTokenizer @@ -263,6 +448,8 @@ class Model(ABC): vocab_size = hparams["vocab_size"] assert max(tokenizer.get_vocab().values()) < vocab_size + tokpre = self.get_vocab_base_pre(tokenizer) + merges = [] vocab = {} mergeable_ranks = tokenizer.mergeable_ranks @@ -280,8 +467,7 @@ class Model(ABC): for i in range(vocab_size): if i not in reverse_vocab: - pad_token = f"[PAD{i}]".encode("utf-8") - tokens.append(bytearray(pad_token)) + tokens.append(f"[PAD{i}]") toktypes.append(gguf.TokenType.USER_DEFINED) elif reverse_vocab[i] in added_vocab: tokens.append(reverse_vocab[i]) @@ -291,6 +477,7 @@ class Model(ABC): toktypes.append(gguf.TokenType.NORMAL) self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_types(toktypes) @@ -314,25 +501,26 @@ class Model(ABC): toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) - sys.exit(1) + raise FileNotFoundError(f"File not found: {tokenizer_path}") + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) - tokenizer = SentencePieceProcessor(str(tokenizer_path)) vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) - for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) + for token_id in range(tokenizer.vocab_size()): + piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) + score = tokenizer.GetScore(token_id) toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): + if tokenizer.IsUnknown(token_id): toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): + elif tokenizer.IsControl(token_id): toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): + elif tokenizer.IsUnused(token_id): toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): + elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE tokens.append(text) @@ -345,11 +533,24 @@ class Model(ABC): added_tokens_json = json.load(f) for key in added_tokens_json: - tokens.append(key.encode("utf-8")) - scores.append(-1000.0) - toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + key = key.encode("utf-8") + if key not in tokens: + tokens.append(key) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.USER_DEFINED) + + if vocab_size > len(tokens): + pad_count = vocab_size - len(tokens) + logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]") + for i in range(1, pad_count + 1): + tokens.append(bytes(f"[PAD{i}]", encoding="utf-8")) + scores.append(-1000.0) + toktypes.append(SentencePieceTokenTypes.UNUSED) + + assert len(tokens) == vocab_size self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) @@ -357,12 +558,8 @@ class Model(ABC): special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def _set_vocab_hf(self): - path = self.dir_model - added_tokens_path = self.dir_model - vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) + def _set_vocab_llama_hf(self): + vocab = LlamaHfVocab(self.dir_model) tokens = [] scores = [] toktypes = [] @@ -375,6 +572,7 @@ class Model(ABC): assert len(tokens) == vocab.vocab_size self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) @@ -420,88 +618,69 @@ class BloomModel(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): - block_count = self.hparams["n_layer"] - tensors = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - has_lm_head = True + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.hparams.get("n_head", self.hparams.get("num_attention_heads")) n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) - for name, data_torch in tensors.items(): - if "lm_head.weight" not in tensors.keys() and "output.weight" not in tensors.keys(): - has_lm_head = False + name = re.sub(r'transformer\.', '', name) - name = re.sub(r'transformer\.', '', name) + tensors: list[tuple[str, Tensor]] = [] - old_dtype = data_torch.dtype + if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa + qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) + data_torch = torch.cat( + ( + qkv_weights[:, 0, :, :].reshape((-1, n_embed)), + qkv_weights[:, 1, :, :].reshape((-1, n_embed)), + qkv_weights[:, 2, :, :].reshape((-1, n_embed)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.weight") + elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): + qkv_bias = data_torch.reshape((n_head, 3, n_embed // n_head)) + data_torch = torch.cat( + ( + qkv_bias[:, 0, :].reshape((n_embed,)), + qkv_bias[:, 1, :].reshape((n_embed,)), + qkv_bias[:, 2, :].reshape((n_embed,)), + ), + dim=0, + ) + logger.info("re-format attention.linear_qkv.bias") - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) + tensors.append((self.map_tensor_name(name), data_torch)) - data = data_torch.squeeze().numpy() + if name == "word_embeddings.weight": + assert self.tensor_names is not None - if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa - qkv_weights = data.reshape((n_head, 3, n_embed // n_head, n_embed)) - data = np.concatenate( - ( - qkv_weights[:, 0, :, :].reshape((-1, n_embed)), - qkv_weights[:, 1, :, :].reshape((-1, n_embed)), - qkv_weights[:, 2, :, :].reshape((-1, n_embed)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.weight") - elif re.match(r"h\.\d+\.self_attention\.query_key_value\.bias", name): - qkv_bias = data.reshape((n_head, 3, n_embed // n_head)) - data = np.concatenate( - ( - qkv_bias[:, 0, :].reshape((n_embed,)), - qkv_bias[:, 1, :].reshape((n_embed,)), - qkv_bias[:, 2, :].reshape((n_embed,)), - ), - axis=0, - ) - print("re-format attention.linear_qkv.bias") + # TODO: tie them at runtime, don't duplicate in the model file + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"=> {new_name}, shape = {data.shape}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "word_embeddings.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + return tensors @Model.register("MPTForCausalLM") class MPTModel(Model): model_arch = gguf.MODEL_ARCH.MPT + def set_vocab(self): + try: + self._set_vocab_gpt2() + except Exception: + # Fallback for SEA-LION model + self._set_vocab_sentencepiece() + self.gguf_writer.add_add_bos_token(False) + self.gguf_writer.add_pad_token_id(3) + self.gguf_writer.add_eos_token_id(1) + self.gguf_writer.add_unk_token_id(0) + def set_gguf_parameters(self): block_count = self.hparams["n_layers"] self.gguf_writer.add_name(self.dir_model.name) @@ -515,53 +694,21 @@ class MPTModel(Model): self.gguf_writer.add_layer_norm_eps(1e-5) if self.hparams["attn_config"]["clip_qkv"] is not None: self.gguf_writer.add_clamp_kqv(self.hparams["attn_config"]["clip_qkv"]) - self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + if self.hparams["attn_config"]["alibi"]: + self.gguf_writer.add_max_alibi_bias(self.hparams["attn_config"]["alibi_bias_max"]) + else: + self.gguf_writer.add_max_alibi_bias(0.0) - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - old_dtype = data_torch.dtype + if "scales" in name: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias", ".scales")) + new_name = new_name.replace("scales", "act.scales") + else: + new_name = self.map_tensor_name(name, try_suffixes=(".weight", ".bias")) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - if "scales" in name: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias", ".scales")) - if new_name is not None: - new_name = new_name.replace("scales", "act.scales") - else: - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return [(new_name, data_torch)] @Model.register("OrionForCausalLM") @@ -585,8 +732,7 @@ class OrionModel(Model): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_file_type(self.ftype) self.gguf_writer.add_name(self.dir_model.name) @@ -602,49 +748,6 @@ class OrionModel(Model): # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - @Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") class BaichuanModel(Model): @@ -667,8 +770,7 @@ class BaichuanModel(Model): elif "model_max_length" in self.hparams: ctx_length = self.hparams["model_max_length"] else: - print("gguf: can not find ctx length parameter.") - sys.exit() + raise ValueError("gguf: can not find ctx length parameter.") self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_source_hf_repo(hf_repo) @@ -687,61 +789,26 @@ class BaichuanModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) - def write_tensors(self): - # Collect tensors from generator object - model_kv = dict(self.get_tensors()) - block_count = self.hparams["num_hidden_layers"] + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: head_count = self.hparams["num_attention_heads"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) head_count_kv = self.hparams.get("num_key_value_heads", head_count) - for i in range(block_count): - if (w := model_kv.get(f"model.layers.{i}.self_attn.W_pack.weight")) is not None: - print(f"Unpacking and permuting layer {i}") - model_kv[f"model.layers.{i}.self_attn.q_proj.weight"] = \ - self._reverse_hf_permute_part(w, 0, head_count, head_count) - model_kv[f"model.layers.{i}.self_attn.k_proj.weight"] = \ - self._reverse_hf_permute_part(w, 1, head_count, head_count_kv) - model_kv[f"model.layers.{i}.self_attn.v_proj.weight"] = \ - self._reverse_hf_part(w, 2) - del model_kv[f"model.layers.{i}.self_attn.W_pack.weight"] + tensors: list[tuple[str, Tensor]] = [] - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue + if bid is not None and name == f"model.layers.{bid}.self_attn.W_pack.weight": + logger.info(f"Unpacking and permuting layer {bid}") + tensors = [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), + self._reverse_hf_permute_part(data_torch, 0, head_count, head_count)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), + self._reverse_hf_permute_part(data_torch, 1, head_count, head_count_kv)), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), + self._reverse_hf_part(data_torch, 2)), + ] + else: + tensors = [(self.map_tensor_name(name), data_torch)] - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{name} -> {new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) + return tensors def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -764,6 +831,111 @@ class BaichuanModel(Model): return weights[r * n_part:r * n_part + r, ...] +@Model.register("XverseForCausalLM") +class XverseModel(Model): + model_arch = gguf.MODEL_ARCH.XVERSE + + def set_vocab(self): + assert (self.dir_model / "tokenizer.json").is_file() + dir_model = self.dir_model + hparams = self.hparams + + tokens: list[bytes] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(dir_model) + vocab_size = hparams.get("vocab_size", len(tokenizer.vocab)) + assert max(tokenizer.vocab.values()) < vocab_size + + reverse_vocab: dict[int, str] = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + for token_id in range(vocab_size): + token_text = reverse_vocab[token_id].encode('utf-8') + # replace "\x00" to string with length > 0 + if token_text == b"\x00": + toktype = gguf.TokenType.BYTE # special + token_text = f"<{token_text}>".encode('utf-8') + elif re.fullmatch(br"<0x[0-9A-Fa-f]{2}>", token_text): + toktype = gguf.TokenType.BYTE # special + elif reverse_vocab[token_id] in added_vocab: + if tokenizer.added_tokens_decoder[token_id].special: + toktype = gguf.TokenType.CONTROL + else: + toktype = gguf.TokenType.USER_DEFINED + else: + toktype = gguf.TokenType.NORMAL + + tokens.append(token_text) + toktypes.append(toktype) + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.hparams["num_hidden_layers"] + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + hf_repo = self.hparams.get("_name_or_path", "") + + ctx_length = 0 + if "max_sequence_length" in self.hparams: + ctx_length = self.hparams["max_sequence_length"] + elif "max_position_embeddings" in self.hparams: + ctx_length = self.hparams["max_position_embeddings"] + elif "model_max_length" in self.hparams: + ctx_length = self.hparams["model_max_length"] + else: + raise ValueError("gguf: can not find ctx length parameter.") + + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_source_hf_repo(hf_repo) + self.gguf_writer.add_tensor_data_layout("Meta AI original pth") + self.gguf_writer.add_context_length(ctx_length) + self.gguf_writer.add_embedding_length(self.hparams["hidden_size"]) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) + self.gguf_writer.add_rope_dimension_count(self.hparams["hidden_size"] // self.hparams["num_attention_heads"]) + self.gguf_writer.add_head_count(head_count) + self.gguf_writer.add_head_count_kv(head_count_kv) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + head_count = self.hparams["num_attention_heads"] + head_count_kv = self.hparams.get("num_key_value_heads", head_count) + + # HF models permute some of the tensors, so we need to undo that + if name.endswith("q_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count) + if name.endswith("k_proj.weight"): + data_torch = self._reverse_hf_permute(data_torch, head_count, head_count_kv) + + return [(self.map_tensor_name(name), data_torch)] + + def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: + if n_kv_head is not None and n_head != n_kv_head: + n_head //= n_kv_head + + return ( + weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape) + ) + + @Model.register("FalconForCausalLM", "RWForCausalLM") class FalconModel(Model): model_arch = gguf.MODEL_ARCH.FALCON @@ -792,72 +964,31 @@ class FalconModel(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): - block_count = self.hparams.get("num_hidden_layers") - if block_count is None: - block_count = self.hparams["n_layer"] # old name + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - n_head = self.hparams.get("num_attention_heads") - if n_head is None: - n_head = self.hparams["n_head"] # old name + # QKV tensor transform + # The original query_key_value tensor contains n_head_kv "kv groups", + # each consisting of n_head/n_head_kv query weights followed by one key + # and one value weight (shared by all query heads in the kv group). + # This layout makes it a big pain to work with in GGML. + # So we rearrange them here,, so that we have n_head query weights + # followed by n_head_kv key weights followed by n_head_kv value weights, + # in contiguous fashion. + # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - n_head_kv = self.hparams.get("num_kv_heads") - if n_head_kv is None: - n_head_kv = self.hparams.get("n_head_kv", 1) # old name + if "query_key_value" in name: + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + n_head_kv = self.find_hparam(["num_kv_heads", "n_head_kv"], optional=True) or 1 + head_dim = self.hparams["hidden_size"] // n_head - head_dim = self.hparams["hidden_size"] // n_head - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) + q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) + k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) + v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) + data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # QKV tensor transform - # The original query_key_value tensor contains n_head_kv "kv groups", - # each consisting of n_head/n_head_kv query weights followed by one key - # and one value weight (shared by all query heads in the kv group). - # This layout makes it a big pain to work with in GGML. - # So we rearrange them here,, so that we have n_head query weights - # followed by n_head_kv key weights followed by n_head_kv value weights, - # in contiguous fashion. - # ref: https://github.com/jploski/ggml/blob/falcon40b/examples/falcon/convert-hf-to-ggml.py - - if "query_key_value" in name: - qkv = data_torch.view(n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head) - q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head) - k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head) - v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head) - data_torch = torch.cat((q, k, v)).reshape_as(data_torch) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return [(self.map_tensor_name(name), data_torch)] @Model.register("GPTBigCodeForCausalLM") @@ -903,7 +1034,7 @@ class RefactModel(Model): self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: hidden_dim = self.hparams["n_embd"] inner_dim = 4 * hidden_dim hidden_dim = int(2 * inner_dim / 3) @@ -912,57 +1043,23 @@ class RefactModel(Model): n_head = self.hparams["n_head"] n_head_kv = 1 head_dim = self.hparams["n_embd"] // n_head - block_count = self.hparams["n_layer"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + tensors: list[tuple[str, Tensor]] = [] - tensors = dict(self.get_tensors()) - for i in range(block_count): - if (w := tensors.get(f"transformer.h.{i}.attn.kv.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.k_proj.weight"] = w[:n_head_kv * head_dim] - tensors[f"model.layers.{i}.self_attn.v_proj.weight"] = w[n_head_kv * head_dim:] - del tensors[f"transformer.h.{i}.attn.kv.weight"] - if (w := tensors.get(f"transformer.h.{i}.attn.q.weight")) is not None: - tensors[f"model.layers.{i}.self_attn.q_proj.weight"] = w - del tensors[f"transformer.h.{i}.attn.q.weight"] - if (w := tensors.get(f"transformer.h.{i}.mlp.gate_up_proj.weight")) is not None: - tensors[f"model.layers.{i}.mlp.gate_proj.weight"] = w[:ff_dim] - tensors[f"model.layers.{i}.mlp.up_proj.weight"] = w[ff_dim:] - del tensors[f"transformer.h.{i}.mlp.gate_up_proj.weight"] + if bid is not None: + if name == f"transformer.h.{bid}.attn.kv.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch[:n_head_kv * head_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), data_torch[n_head_kv * head_dim:])) + elif name == f"transformer.h.{bid}.attn.q.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch)) + elif name == f"transformer.h.{bid}.mlp.gate_up_proj.weight": + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch[:ff_dim])) + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), data_torch[ff_dim:])) - for name, data_torch in tensors.items(): - old_dtype = data_torch.dtype + if len(tensors) == 0: + tensors.append((self.map_tensor_name(name), data_torch)) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight",)) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return tensors @Model.register("PersimmonForCausalLM") @@ -997,23 +1094,11 @@ class PersimmonModel(Model): # self.gguf_writer.add_bos_token_id(71013) # self.gguf_writer.add_eos_token_id(71013) - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid, n_dims # unused - for name, data_torch in self.get_tensors(): - if name.endswith(".self_attention.rotary_emb.inv_freq"): - continue - old_dtype = data_torch.dtype - # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) - data = data_torch.to(torch.float32).squeeze().numpy() - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - n_dims = len(data.shape) - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) + # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) + return True @Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") @@ -1039,17 +1124,299 @@ class StableLMModel(Model): rotary_factor = self.find_hparam(["partial_rotary_factor", "rope_pct"]) self.gguf_writer.add_rope_dimension_count(int(rotary_factor * (hparams["hidden_size"] // hparams["num_attention_heads"]))) self.gguf_writer.add_head_count(hparams["num_attention_heads"]) + self.gguf_writer.add_head_count_kv(hparams["num_key_value_heads"]) self.gguf_writer.add_parallel_residual(hparams["use_parallel_residual"] if "use_parallel_residual" in hparams else True) self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) + _q_norms: list[dict[str, Tensor]] | None = None + _k_norms: list[dict[str, Tensor]] | None = None -@Model.register("MixtralForCausalLM") -class MixtralModel(Model): + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams["num_key_value_heads"] + + if name.find("q_layernorm.norms") != -1: + assert bid is not None + + if self._q_norms is None: + self._q_norms = [{} for _ in range(self.block_count)] + + self._q_norms[bid][name] = data_torch + + if len(self._q_norms[bid]) >= n_head: + return self._stack_qk_norm(bid, n_head, self._q_norms[bid], "q_layernorm") + else: + return [] + + if name.find("k_layernorm.norms") != -1: + assert bid is not None + + if self._k_norms is None: + self._k_norms = [{} for _ in range(self.block_count)] + + self._k_norms[bid][name] = data_torch + + if len(self._k_norms[bid]) >= n_kv_head: + return self._stack_qk_norm(bid, n_kv_head, self._k_norms[bid], "k_layernorm") + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def _stack_qk_norm(self, bid: int, n_head: int, norms: dict[str, Tensor], layer_name: str = "q_layernorm"): + datas: list[Tensor] = [] + # extract the norms in order + for xid in range(n_head): + ename = f"model.layers.{bid}.self_attn.{layer_name}.norms.{xid}.weight" + datas.append(norms[ename]) + del norms[ename] + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.self_attn.{layer_name}.weight" + new_name = self.map_tensor_name(merged_name) + + return [(new_name, data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._q_norms is not None or self._k_norms is not None: + # flatten two `list[dict[str, Tensor]]` into a single `list[str]` + norms = ( + [k for d in self._q_norms for k in d.keys()] if self._q_norms is not None else [] + ) + ( + [k for d in self._k_norms for k in d.keys()] if self._k_norms is not None else [] + ) + if len(norms) > 0: + raise ValueError(f"Unprocessed norms: {norms}") + + +@Model.register("LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(Model): model_arch = gguf.MODEL_ARCH.LLAMA + def set_vocab(self): + try: + self. _set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_rope_dimension_count(hparams["hidden_size"] // hparams["num_attention_heads"]) + + if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: + if self.hparams["rope_scaling"].get("type") == "linear": + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) + self.gguf_writer.add_rope_scaling_factor(self.hparams["rope_scaling"]["factor"]) + + @staticmethod + def permute(weights: Tensor, n_head: int, n_head_kv: int | None): + if n_head_kv is not None and n_head != n_head_kv: + n_head = n_head_kv + return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) + .swapaxes(1, 2) + .reshape(weights.shape)) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + # process the experts separately + if name.find("block_sparse_moe.experts") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["w1", "w2", "w3"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"layers.{bid}.feed_forward.experts.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@Model.register("GrokForCausalLM") +class GrokModel(Model): + model_arch = gguf.MODEL_ARCH.GROK + def set_vocab(self): self._set_vocab_sentencepiece() + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_name("Grok") + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find(".moe.") != -1: + n_experts = self.hparams["num_local_experts"] + + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for wid in ["linear", "linear_1", "linear_v"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + +@Model.register("DbrxForCausalLM") +class DbrxModel(Model): + model_arch = gguf.MODEL_ARCH.DBRX + + def set_gguf_parameters(self): + ffn_config = self.hparams["ffn_config"] + attn_config = self.hparams["attn_config"] + self.gguf_writer.add_name(self.hparams["model_type"]) + self.gguf_writer.add_block_count(self.hparams["n_layers"]) + + self.gguf_writer.add_context_length(self.hparams["max_seq_len"]) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_feed_forward_length(ffn_config["ffn_hidden_size"]) + + self.gguf_writer.add_head_count(self.hparams["n_heads"]) + self.gguf_writer.add_head_count_kv(attn_config["kv_n_heads"]) + + self.gguf_writer.add_rope_freq_base(attn_config["rope_theta"]) + + self.gguf_writer.add_clamp_kqv(attn_config["clip_qkv"]) + self.gguf_writer.add_file_type(self.ftype) + + self.gguf_writer.add_expert_count(ffn_config["moe_num_experts"]) + self.gguf_writer.add_expert_used_count(ffn_config["moe_top_k"]) + + self.gguf_writer.add_layer_norm_eps(1e-5) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_expert = self.hparams["ffn_config"]["moe_num_experts"] + n_ff = self.hparams["ffn_config"]["ffn_hidden_size"] + n_embd = self.hparams["d_model"] + + # Specific behavior for experts tensors: suffix .weight, view as 3D and transpose + # original implementation expects (n_expert, n_ff, n_embd) for all experts weights + # But llama.cpp moe graph works differently + # AND the dimensions in ggml are typically in the reverse order of the pytorch dimensions + # so (n_expert, n_ff, n_embd) in pytorch is {n_embd, n_ff, n_expert} in ggml_tensor + exp_tensor_names = {"ffn.experts.mlp.w1": None, # LLM_TENSOR_FFN_GATE_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + "ffn.experts.mlp.w2": (0, 2, 1), # LLM_TENSOR_FFN_DOWN_EXPS ggml_tensor->ne{n_ff, n_embd, n_expert} + "ffn.experts.mlp.v1": None} # LLM_TENSOR_FFN_UP_EXPS ggml_tensor->ne{n_embd, n_ff, n_expert} + experts = False + + for exp_tensor_name in exp_tensor_names.keys(): + if name.find(exp_tensor_name) != -1 and name.find(".weight") == -1: + experts = True + data_torch = data_torch.view(n_expert, n_ff, n_embd) + if (permute_tensor := exp_tensor_names[exp_tensor_name]) is not None: + data_torch = data_torch.permute(*permute_tensor) + break + + # map tensor names + # In MoE models the ffn tensors are typically most of the model weights, + # and need to be quantizable. Quantize expects tensor names to be suffixed by .weight. + # Every other model has the weight names ending in .weight, + # let's assume that is the convention which is not the case for dbrx: + # https://huggingface.co/databricks/dbrx-instruct/blob/main/model.safetensors.index.json#L15 + new_name = self.map_tensor_name(name if not experts else name + ".weight", try_suffixes=(".weight",)) + + return [(new_name, data_torch)] + + def extra_f16_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del name, new_name, bid # unused + + return n_dims > 1 + @Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): @@ -1069,7 +1436,7 @@ class MiniCPMModel(Model): self.gguf_writer.add_file_type(self.ftype) def set_vocab(self): - self._set_vocab_hf() + self._set_vocab_llama_hf() def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor: if n_kv_head is not None and n_head != n_kv_head: @@ -1081,54 +1448,19 @@ class MiniCPMModel(Model): .reshape(weights.shape) ) - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - n_head = self.hparams.get("num_attention_heads") + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")): - continue - old_dtype = data_torch.dtype + # HF models permute some of the tensors, so we need to undo that + if name.endswith(("q_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight")): + data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - # HF models permute some of the tensors, so we need to undo that - if name.endswith(("q_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight")): - data_torch = self._reverse_hf_permute(data_torch, n_head, n_kv_head) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return [(self.map_tensor_name(name), data_torch)] @Model.register("QWenLMHeadModel") @@ -1172,52 +1504,74 @@ class QwenModel(Model): self.gguf_writer.add_head_count(self.hparams["num_attention_heads"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["layer_norm_epsilon"]) - def write_tensors(self): - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - @Model.register("Qwen2ForCausalLM") class Qwen2Model(Model): model_arch = gguf.MODEL_ARCH.QWEN2 + def set_vocab(self): + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + self._set_vocab_gpt2() + + +@Model.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN2MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + datas: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(datas, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def write_tensors(self): + super().write_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + @Model.register("GPT2LMHeadModel") class GPT2Model(Model): @@ -1233,55 +1587,27 @@ class GPT2Model(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_epsilon"]) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - for name, data_torch in self.get_tensors(): - # we don't need these - if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq", ".attn.bias", ".attn.masked_bias")): - continue + tensors: list[tuple[str, Tensor]] = [] - if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): - data_torch = data_torch.transpose(1, 0) + # we don't need these + if name.endswith((".attn.bias", ".attn.masked_bias")): + return tensors - old_dtype = data_torch.dtype + if name.endswith((".c_attn.weight", ".c_proj.weight", ".c_fc.weight", ".c_proj.weight")): + data_torch = data_torch.transpose(1, 0) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) + new_name = self.map_tensor_name(name) - data = data_torch.squeeze().numpy() + tensors.append((new_name, data_torch)) - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + # note: GPT2 output is tied to (same as) wte in original model + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - # note: GPT2 output is tied to (same as) wte in original model - if new_name == "token_embd.weight": - print(f"output.weight, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor("output.weight", data) + return tensors @Model.register("PhiForCausalLM") @@ -1309,6 +1635,92 @@ class Phi2Model(Model): self.gguf_writer.add_add_bos_token(False) +@Model.register("Phi3ForCausalLM") +class Phi3MiniModel(Model): + model_arch = gguf.MODEL_ARCH.PHI3 + + def set_vocab(self): + from sentencepiece import SentencePieceProcessor + + tokenizer_path = self.dir_model / 'tokenizer.model' + + if not tokenizer_path.is_file(): + raise ValueError(f'Error: Missing {tokenizer_path}') + + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) + + tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)] + scores: list[float] = [-10000.0] * vocab_size + toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size + + for token_id in range(tokenizer.vocab_size()): + + piece = tokenizer.IdToPiece(token_id) + text = piece.encode("utf-8") + score = tokenizer.GetScore(token_id) + + toktype = SentencePieceTokenTypes.NORMAL + if tokenizer.IsUnknown(token_id): + toktype = SentencePieceTokenTypes.UNKNOWN + elif tokenizer.IsControl(token_id): + toktype = SentencePieceTokenTypes.CONTROL + elif tokenizer.IsUnused(token_id): + toktype = SentencePieceTokenTypes.UNUSED + elif tokenizer.IsByte(token_id): + toktype = SentencePieceTokenTypes.BYTE + + tokens[token_id] = text + scores[token_id] = score + toktypes[token_id] = toktype + + added_tokens_file = self.dir_model / 'added_tokens.json' + if added_tokens_file.is_file(): + with open(added_tokens_file, "r", encoding="utf-8") as f: + added_tokens_json = json.load(f) + + for key in added_tokens_json: + token_id = added_tokens_json[key] + if (token_id >= vocab_size): + logger.debug(f'ignore token {token_id}: id is out of range, max={vocab_size - 1}') + continue + + tokens[token_id] = key.encode("utf-8") + scores[token_id] = -1000.0 + toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED + + self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) + special_vocab.add_to_gguf(self.gguf_writer) + + def set_gguf_parameters(self): + block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) + + rot_pct = 1.0 + n_embd = self.find_hparam(["hidden_size", "n_embd"]) + n_head = self.find_hparam(["num_attention_heads", "n_head"]) + rms_eps = self.find_hparam(["rms_norm_eps"]) + + self.gguf_writer.add_name("Phi3") + self.gguf_writer.add_context_length(self.find_hparam(["n_positions", "max_position_embeddings"])) + + self.gguf_writer.add_embedding_length(n_embd) + self.gguf_writer.add_feed_forward_length(8192) + self.gguf_writer.add_block_count(block_count) + self.gguf_writer.add_head_count(n_head) + self.gguf_writer.add_head_count_kv(n_head) + self.gguf_writer.add_layer_norm_rms_eps(rms_eps) + self.gguf_writer.add_rope_dimension_count(int(rot_pct * n_embd) // n_head) + self.gguf_writer.add_file_type(self.ftype) + + @Model.register("PlamoForCausalLM") class PlamoModel(Model): model_arch = gguf.MODEL_ARCH.PLAMO @@ -1343,52 +1755,18 @@ class PlamoModel(Model): data_torch = torch.reshape(data_torch, (5120, 5120)) return data_torch - def write_tensors(self): - block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - for name, data_torch in self.get_tensors(): - if "self_attn.rotary_emb.inv_freq" in name: - continue + new_name = self.map_tensor_name(name) - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + # shuffle for broadcasting of gqa in ggml_mul_mat + if new_name.endswith("attn_q.weight"): + data_torch = self.shuffle_attn_q_weight(data_torch) + elif new_name.endswith("attn_output.weight"): + data_torch = self.shuffle_attn_output_weight(data_torch) - # shuffle for broadcasting of gqa in ggml_mul_mat - if new_name.endswith("attn_q.weight"): - data_torch = self.shuffle_attn_q_weight(data_torch) - elif new_name.endswith("attn_output.weight"): - data_torch = self.shuffle_attn_output_weight(data_torch) - - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return [(new_name, data_torch)] @Model.register("CodeShellForCausalLM") @@ -1411,52 +1789,21 @@ class CodeShellModel(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) - tensors = dict(self.get_tensors()) - has_lm_head = "lm_head.weight" in tensors.keys() or "output.weight" in tensors.keys() - for name, data_torch in tensors.items(): - # we don't need these - if name.endswith((".attn.rotary_emb.inv_freq")): - continue + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - old_dtype = data_torch.dtype + new_name = self.map_tensor_name(name) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) + tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)] - data = data_torch.squeeze().numpy() + if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD): + assert self.tensor_names is not None - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")): + # copy tok_embd.weight to output.weight + tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch)) - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) - - if not has_lm_head and name == "transformer.wte.weight": - self.gguf_writer.add_tensor("output.weight", data) - print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") + return tensors @Model.register("InternLM2ForCausalLM") @@ -1478,34 +1825,36 @@ class InternLM2Model(Model): toktypes: list[int] = [] if not tokenizer_path.is_file(): - print(f'Error: Missing {tokenizer_path}', file=sys.stderr) + logger.error(f'Error: Missing {tokenizer_path}') sys.exit(1) sentencepiece_model = model.ModelProto() sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read()) add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix - tokenizer = SentencePieceProcessor(str(tokenizer_path)) + tokenizer = SentencePieceProcessor() + tokenizer.LoadFromFile(str(tokenizer_path)) + vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size()) for token_id in range(vocab_size): - piece = tokenizer.id_to_piece(token_id) + piece = tokenizer.IdToPiece(token_id) text = piece.encode("utf-8") - score = tokenizer.get_score(token_id) + score = tokenizer.GetScore(token_id) if text == b"\x00": # (TODO): fixme # Hack here and replace the \x00 characters. - print(f"InternLM2 convert token '{text}' to '🐉'!") - text = "🐉" + logger.warning(f"InternLM2 convert token '{text}' to '🐉'!") + text = "🐉".encode("utf-8") toktype = SentencePieceTokenTypes.NORMAL - if tokenizer.is_unknown(token_id): + if tokenizer.IsUnknown(token_id): toktype = SentencePieceTokenTypes.UNKNOWN - elif tokenizer.is_control(token_id): + elif tokenizer.IsControl(token_id): toktype = SentencePieceTokenTypes.CONTROL - elif tokenizer.is_unused(token_id): + elif tokenizer.IsUnused(token_id): toktype = SentencePieceTokenTypes.UNUSED - elif tokenizer.is_byte(token_id): + elif tokenizer.IsByte(token_id): toktype = SentencePieceTokenTypes.BYTE tokens.append(text) @@ -1523,6 +1872,7 @@ class InternLM2Model(Model): toktypes.append(SentencePieceTokenTypes.USER_DEFINED) self.gguf_writer.add_tokenizer_model("llama") + self.gguf_writer.add_tokenizer_pre("default") self.gguf_writer.add_token_list(tokens) self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) @@ -1532,20 +1882,24 @@ class InternLM2Model(Model): old_eos = special_vocab.special_token_ids["eos"] if "chat" in os.path.basename(self.dir_model.absolute()): # For the chat model, we replace the eos with '<|im_end|>'. + # TODO: this is a hack, should be fixed + # https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048 special_vocab.special_token_ids["eos"] = self._try_get_sft_eos(tokenizer) - print(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ + logger.warning(f"Replace eos:{old_eos} with a special token:{special_vocab.special_token_ids['eos']} \ in chat mode so that the conversation can end normally.") special_vocab.add_to_gguf(self.gguf_writer) def _try_get_sft_eos(self, tokenizer): - unused_145_list = tokenizer.encode('[UNUSED_TOKEN_145]') - im_end_list = tokenizer.encode('<|im_end|>') + unused_145_list = tokenizer.Encode('[UNUSED_TOKEN_145]') + im_end_list = tokenizer.Encode('<|im_end|>') + eos_token = None assert (len(unused_145_list) == 1) ^ (len(im_end_list) == 1) if len(unused_145_list) == 1: eos_token = unused_145_list[0] if len(im_end_list) == 1: eos_token = im_end_list[0] + assert eos_token return eos_token def _hf_permute_qk(self, weights, n_head: int, n_head_kv: int): @@ -1566,72 +1920,36 @@ in chat mode so that the conversation can end normally.") self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"]) self.gguf_writer.add_head_count_kv(self.hparams["num_key_value_heads"]) - def post_write_tensors(self, tensor_map, name, data_torch): - old_dtype = data_torch.dtype - - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - self.gguf_writer.add_tensor(new_name, data) - - def write_tensors(self): - from einops import rearrange - - num_heads = self.hparams.get("num_attention_heads") - num_kv_heads = self.hparams.get("num_key_value_heads") - hidden_size = self.hparams.get("hidden_size") + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + num_heads = self.hparams["num_attention_heads"] + num_kv_heads = self.hparams["num_key_value_heads"] + hidden_size = self.hparams["hidden_size"] q_per_kv = num_heads // num_kv_heads head_dim = hidden_size // num_heads num_groups = num_heads // q_per_kv - block_count = self.hparams["num_hidden_layers"] - model_kv = dict(self.get_tensors()) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) qkv_pattern = r"model\.layers\.(\d+)\.attention\.wqkv" - for name, data_torch in model_kv.items(): - # we don't need these - if name.endswith(".rotary_emb.inv_freq"): - continue - if re.match(qkv_pattern, name): - bid = re.findall(qkv_pattern, name)[0] - qkv = data_torch - qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) - q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] - # The model weights of q and k equire additional reshape. - q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) - k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) - v = rearrange(v, " o g n i -> o (g n i)").T - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wq.weight", q) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wk.weight", k) - self.post_write_tensors(tensor_map, f"model.layers.{bid}.attention.wv.weight", v) - else: - self.post_write_tensors(tensor_map, name, data_torch) + if re.match(qkv_pattern, name): + bid = re.findall(qkv_pattern, name)[0] + qkv = data_torch + # qkv = rearrange(qkv.T, " o (g n i) ->o g n i", g=num_groups, n=q_per_kv + 2, i=head_dim) + qkv = qkv.T.reshape((-1, num_groups, q_per_kv + 2, head_dim)) + q, k, v = qkv[..., : q_per_kv, :], qkv[..., q_per_kv: q_per_kv + 1, :], qkv[..., q_per_kv + 1: q_per_kv + 2, :] + # The model weights of q and k equire additional reshape. + # q = self._hf_permute_qk(rearrange(q, " o g n i -> o (g n i)").T, num_heads, num_heads) + q = self._hf_permute_qk(q.reshape((q.shape[0], -1)).T, num_heads, num_heads) + # k = self._hf_permute_qk(rearrange(k, " o g n i -> o (g n i)").T, num_heads, num_kv_heads) + k = self._hf_permute_qk(k.reshape((k.shape[0], -1)).T, num_heads, num_kv_heads) + # v = rearrange(v, " o g n i -> o (g n i)").T + v = v.reshape((v.shape[0], -1)).T + return [ + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), q), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), k), + (self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), v), + ] + else: + return [(self.map_tensor_name(name), data_torch)] @Model.register("BertModel", "CamembertModel") @@ -1670,77 +1988,46 @@ class BertModel(Model): self.gguf_writer.add_pooling_type(pooling_type) def set_vocab(self): - path = self.dir_model - added_tokens_path = self.dir_model if self.dir_model.exists() else None - - # use huggingface vocab to get all tokens - vocab = HfVocab(path, added_tokens_path) - tokens, scores, toktypes = zip(*vocab.all_tokens()) - assert len(tokens) == vocab.vocab_size - self.vocab_size = vocab.vocab_size + tokens, toktypes, tokpre = self.get_vocab_base() + self.vocab_size = len(tokens) # we need this to validate the size of the token_type embeddings # though currently we are passing all zeros to the token_type embeddings - n_token_types = len(set(toktypes)) - self.gguf_writer.add_token_type_count(n_token_types) + self.gguf_writer.add_token_type_count(2) # "Sequence A" or "Sequence B" # convert to phantom space vocab - def phantom(tok, typ): - if tok.startswith(b"[") and tok.endswith(b"]"): + def phantom(tok): + if tok.startswith("[") and tok.endswith("]"): return tok - if tok.startswith(b"##"): + if tok.startswith("##"): return tok[2:] - return b"\xe2\x96\x81" + tok - tokens = tuple(phantom(t, y) for t, y in zip(tokens, toktypes)) - - # set up bos and eos tokens (cls and sep) - self.gguf_writer.add_bos_token_id(vocab.tokenizer.cls_token_id) - self.gguf_writer.add_eos_token_id(vocab.tokenizer.sep_token_id) + return "\u2581" + tok + tokens = list(map(phantom, tokens)) # add vocab to gguf self.gguf_writer.add_tokenizer_model("bert") + self.gguf_writer.add_tokenizer_pre(tokpre) self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_scores(scores) self.gguf_writer.add_token_types(toktypes) # handle special tokens special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens)) special_vocab.add_to_gguf(self.gguf_writer) - def write_tensors(self): - tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) - tensors = dict(self.get_tensors()) - for name, data_torch in tensors.items(): - # we are only using BERT for embeddings so we don't need the pooling layer - if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): - continue # we don't need these + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + # we are only using BERT for embeddings so we don't need the pooling layer + if name in ("embeddings.position_ids", "pooler.dense.weight", "pooler.dense.bias"): + return [] # we don't need these - data = data_torch.squeeze().numpy() - n_dims = len(data.shape) - new_dtype: type[np.floating[Any]] + return [(self.map_tensor_name(name), data_torch)] - if ( - self.ftype == 1 and name.endswith(".weight") and n_dims == 2 - and name != "embeddings.token_type_embeddings.weight" # not used with get_rows, must be F32 - ): - # if f16 desired, convert any float32 2-dim weight tensors to float16 - new_dtype = np.float16 - else: - # if f32 desired, convert any float16 to float32 - new_dtype = np.float32 + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del new_name, bid, n_dims # unused - print(f"{new_name}, n_dims = {n_dims}, {data_torch.dtype} --> {new_dtype}") - - if data.dtype != new_dtype: - data = data.astype(new_dtype) - - self.gguf_writer.add_tensor(new_name, data) + # not used with get_rows, must be F32 + return name == "embeddings.token_type_embeddings.weight" @Model.register("NomicBertModel") @@ -1772,16 +2059,6 @@ class NomicBertModel(BertModel): super().set_gguf_parameters() self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - def get_tensors(self): - assert self.vocab_size is not None - for name, data in super().get_tensors(): - # Nomic Embed's token embeddings tensor is padded, but llama.cpp wants tensor sizes to match exactly. - if name == 'embeddings.word_embeddings.weight' and data.shape[1] != self.vocab_size: - rounded_vocab_size = (self.vocab_size + 63) // 64 * 64 - assert data.shape == (rounded_vocab_size, self.hparams["n_embd"]) - data = data[:self.vocab_size, :] - yield name, data - @Model.register("GemmaForCausalLM") class GemmaModel(Model): @@ -1790,6 +2067,16 @@ class GemmaModel(Model): def set_vocab(self): self._set_vocab_sentencepiece() + # TODO: these special tokens should be exported only for the CodeGemma family + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'fsep', 'eot']) + special_vocab._set_special_token("prefix", 67) + special_vocab._set_special_token("suffix", 69) + special_vocab._set_special_token("middle", 68) + special_vocab._set_special_token("fsep", 70) + special_vocab._set_special_token("eot", 107) + special_vocab.add_to_gguf(self.gguf_writer) + def set_gguf_parameters(self): hparams = self.hparams block_count = hparams["num_hidden_layers"] @@ -1806,40 +2093,20 @@ class GemmaModel(Model): self.gguf_writer.add_value_length(hparams["head_dim"]) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): - block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer"))) - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype + # lm_head is not used in llama.cpp, while autoawq will include this tensor in model + # To prevent errors, skip loading lm_head.weight. + if name == "lm_head.weight": + logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") + return [] - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - data = data_torch.squeeze().numpy() - - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() - - n_dims = len(data.shape) - data_dtype = data.dtype - - data = data.astype(np.float32) - - # if f16 desired, convert any float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return [(self.map_tensor_name(name), data_torch)] @Model.register("Starcoder2ForCausalLM") @@ -1862,36 +2129,53 @@ class MambaModel(Model): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() + elif (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() else: # Use the GPT-NeoX tokenizer when no tokenizer files are present tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" - print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + logger.warning(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") neox_reader = gguf.GGUFReader(tokenizer_path, "r") field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) - self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1]).decode("utf-8") if field else "gpt2") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PRE) + self.gguf_writer.add_tokenizer_pre(bytes(field.parts[-1]).decode("utf-8") if field else "mpt") + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + assert field self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + assert field self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + assert field self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) - self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0] if field else 1) + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) - self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0] if field else 0) + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) - self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0] if field else 0) + + field = neox_reader.get_field(gguf.Keys.Tokenizer.PAD_ID) + self.gguf_writer.add_pad_token_id(field.parts[-1].tolist()[0] if field else 0) def set_gguf_parameters(self): - d_model = self.find_hparam(["hidden_size", "d_model"]) - d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model - d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 # Fail early for models which don't have a block expansion factor of 2 @@ -1910,59 +2194,42 @@ class MambaModel(Model): self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_file_type(self.ftype) - def write_tensors(self): - block_count = self.hparams["n_layer"] - tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + _tok_embd = None - tok_embd = None - tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" - output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused - for name, data_torch in self.get_tensors(): - old_dtype = data_torch.dtype + output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) + tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) + new_name = self.map_tensor_name(name) - # map tensor names - new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) - if new_name is None: - print(f"Can not map tensor {name!r}") - sys.exit() + if name.endswith(".A_log"): + logger.debug("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) - if name.endswith(".A_log"): - print("A_log --> A ==> " + new_name) - data_torch = -torch.exp(data_torch) + # assuming token_embd.weight is seen before output.weight + if self._tok_embd is not None and new_name == output_name: + if torch.equal(self._tok_embd, data_torch): + logger.debug(f"{output_name} is equivalent to {tok_embd_name}, omitting") + return [] + elif new_name == tok_embd_name: + self._tok_embd = data_torch - # assuming token_embd.weight is seen before output.weight - if tok_embd is not None and new_name == output_name: - if torch.equal(tok_embd, data_torch): - print(f"{output_name} is equivalent to {tok_embd_name}, omitting") - continue - if new_name == tok_embd_name: - tok_embd = data_torch + return [(new_name, data_torch)] - data = data_torch.squeeze().numpy() + def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool: + del n_dims # unused - n_dims = len(data.shape) - data_dtype = data.dtype - - # if f32 desired, convert any float16 to float32 - if self.ftype == 0 and data_dtype == np.float16: - data = data.astype(np.float32) - - # TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32 - if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: - data = data.astype(np.float32) - - # if f16 desired, convert big float32 2-dim weight tensors to float16 - if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: - data = data.astype(np.float16) - - print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") - - self.gguf_writer.add_tensor(new_name, data) + return bid is not None and new_name in ( + self.format_tensor_name(n, bid, ".weight" if name.endswith(".weight") else "") for n in [ + gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SSM_X, + gguf.MODEL_TENSOR.SSM_DT, + gguf.MODEL_TENSOR.SSM_A, + gguf.MODEL_TENSOR.SSM_D, + ] + ) @Model.register("CohereForCausalLM") @@ -1982,9 +2249,158 @@ class CommandR2Model(Model): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) +@Model.register("OlmoForCausalLM") +@Model.register("OLMoForCausalLM") +class OlmoModel(Model): + model_arch = gguf.MODEL_ARCH.OLMO + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_layer_norm_eps(1e-5) + clip_qkv = self.hparams.get("clip_qkv") + if clip_qkv is not None: + self.gguf_writer.add_clamp_kqv(clip_qkv) + + # Same as super class, but permuting q_proj, k_proj + # Copied from: LlamaModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + n_head = self.hparams["num_attention_heads"] + n_kv_head = self.hparams.get("num_key_value_heads") + + if name.endswith("q_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith("k_proj.weight"): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + + return [(self.map_tensor_name(name), data_torch)] + + ###### CONVERSION LOGIC ###### +# tree of lazy tensors +class LazyTorchTensor: + _meta: Tensor + _data: Tensor | None + _args: tuple + _func: Callable[[tuple], Tensor] | None + + def __init__(self, *, meta: Tensor, data: Tensor | None = None, args: tuple = (), func: Callable[[tuple], Tensor] | None = None): + self._meta = meta + self._data = data + self._args = args + self._func = func + + @staticmethod + def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any: + # TODO: dict and set + if isinstance(o, (list, tuple)): + L = [] + for item in o: + L.append(LazyTorchTensor._recurse_apply(item, fn)) + if isinstance(o, tuple): + L = tuple(L) + return L + elif isinstance(o, LazyTorchTensor): + return fn(o) + else: + return o + + def _wrap_fn(self, fn: Callable, use_self: bool = False) -> Callable[[Any], LazyTorchTensor]: + def wrapped_fn(*args, **kwargs): + if kwargs is None: + kwargs = {} + args = ((self,) if use_self else ()) + args + + meta_args = LazyTorchTensor._recurse_apply(args, lambda t: t._meta) + + return LazyTorchTensor(meta=fn(*meta_args, **kwargs), args=args, func=lambda a: fn(*a, **kwargs)) + return wrapped_fn + + def __getattr__(self, __name: str) -> Any: + meta_attr = getattr(self._meta, __name) + if callable(meta_attr): + return self._wrap_fn(getattr(torch.Tensor, __name), use_self=True) + elif isinstance(meta_attr, torch.Tensor): + # for things like self.T + return self._wrap_fn(lambda s: getattr(s, __name))(self) + else: + return meta_attr + + _dtype_map: dict[torch.dtype, type] = { + torch.float16: np.float16, + torch.float32: np.float32, + } + + def numpy(self) -> gguf.LazyTensor: + dtype = self._dtype_map[self.dtype] + return gguf.LazyTensor(lambda: LazyTorchTensor.to_eager(self).numpy(), dtype=dtype, shape=self.shape) + + @overload + @staticmethod + def to_eager(t: Tensor | LazyTorchTensor) -> Tensor: ... + + @overload + @staticmethod + def to_eager(t: tuple) -> tuple: ... + + @staticmethod + def to_eager(t: Any) -> Any: + def simple_to_eager(_t: LazyTorchTensor) -> Tensor: + # wake up the lazy tensor + if _t._data is None and _t._func is not None: + # recurse into its arguments + _t._args = LazyTorchTensor.to_eager(_t._args) + _t._data = _t._func(_t._args) + if _t._data is not None: + return _t._data + else: + raise ValueError(f"Could not compute lazy tensor {_t!r} with args {_t._args!r}") + + # recurse into lists and/or tuples, keeping their structure + return LazyTorchTensor._recurse_apply(t, simple_to_eager) + + @staticmethod + def from_eager(t: Tensor) -> Tensor: + if (t.__class__ == LazyTorchTensor): + return t + return LazyTorchTensor(meta=t.detach().to("meta"), data=t) # type: ignore + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + del types # unused + + if kwargs is None: + kwargs = {} + + if func is torch.Tensor.numpy: + return args[0].numpy() + if func is torch.equal: + eager_args = LazyTorchTensor.to_eager(args) + return func(*eager_args, **kwargs) + + return LazyTorchTensor._wrap_fn(args[0], func)(*args, **kwargs) + + # special methods bypass __getattr__, so they need to be added manually + # ref: https://docs.python.org/3/reference/datamodel.html#special-lookup + # NOTE: LazyTorchTensor can't be a subclass of Tensor (and then be used + # as self._meta is currently used), because then the following + # operations would by default not be wrapped, and so not propagated + # when the tensor is made eager. + # It's better to get non-silent errors for not-yet-supported operators. + # TODO: add more when needed to avoid clutter, or find a more concise way + def __neg__(self, *args): # mamba + return self._wrap_fn(torch.Tensor.__neg__)(self, *args) + + def __add__(self, *args): # gemma + return self._wrap_fn(torch.Tensor.__add__)(self, *args) + + def __getitem__(self, *args): # bloom falcon refact internlm2 + return self._wrap_fn(torch.Tensor.__getitem__)(self, *args) + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Convert a huggingface model to a GGML compatible file") @@ -1994,7 +2410,8 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--awq-path", type=Path, default=None, - help="Path to scale awq cache file") + help="Path to scale awq cache file", + ) parser.add_argument( "--outfile", type=Path, help="path to write to; default: based on input", @@ -2003,11 +2420,30 @@ def parse_args() -> argparse.Namespace: "--outtype", type=str, choices=["f32", "f16"], default="f16", help="output format - use f32 for float32, f16 for float16", ) - parser.add_argument("--bigendian", action="store_true", help="model is executed on big endian machine") + parser.add_argument( + "--bigendian", action="store_true", + help="model is executed on big endian machine", + ) parser.add_argument( "model", type=Path, help="directory containing model file", ) + parser.add_argument( + "--use-temp-file", action="store_true", + help="use the tempfile library while processing (helpful when running out of memory, process killed)", + ) + parser.add_argument( + "--no-lazy", action="store_true", + help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", + ) + parser.add_argument( + "--model-name", type=str, default=None, + help="name of the model", + ) + parser.add_argument( + "--verbose", action="store_true", + help="increase output verbosity", + ) return parser.parse_args() @@ -2015,6 +2451,8 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + dir_model = args.model if args.awq_path: @@ -2023,15 +2461,15 @@ def main() -> None: tmp_model_path = args.model / "weighted_model" dir_model = tmp_model_path if tmp_model_path.is_dir(): - print(f"{tmp_model_path} exists as a weighted model.") + logger.info(f"{tmp_model_path} exists as a weighted model.") else: tmp_model_path.mkdir(parents=True, exist_ok=True) - print("Saving new weighted model ...") + logger.info("Saving new weighted model ...") add_scale_weights(str(args.model), str(args.awq_path), str(tmp_model_path)) - print(f"Saved weighted model at {tmp_model_path}.") + logger.info(f"Saved weighted model at {tmp_model_path}.") if not dir_model.is_dir(): - print(f'Error: {args.model} is not a directory', file=sys.stderr) + logger.error(f'Error: {args.model} is not a directory') sys.exit(1) ftype_map = { @@ -2045,28 +2483,28 @@ def main() -> None: # output in the same directory as the model by default fname_out = dir_model / f'ggml-model-{args.outtype}.gguf' - print(f"Loading model: {dir_model.name}") + logger.info(f"Loading model: {dir_model.name}") hparams = Model.load_hparams(dir_model) with torch.inference_mode(): model_class = Model.from_model_architecture(hparams["architectures"][0]) - model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian) + model_instance = model_class(dir_model, ftype_map[args.outtype], fname_out, args.bigendian, args.use_temp_file, args.no_lazy) - print("Set model parameters") + logger.info("Set model parameters") model_instance.set_gguf_parameters() - print("Set model tokenizer") + logger.info("Set model tokenizer") model_instance.set_vocab() if args.vocab_only: - print(f"Exporting model vocab to '{fname_out}'") + logger.info(f"Exporting model vocab to '{fname_out}'") model_instance.write_vocab() else: - print(f"Exporting model to '{fname_out}'") + logger.info(f"Exporting model to '{fname_out}'") model_instance.write() - print(f"Model successfully exported to '{fname_out}'") + logger.info(f"Model successfully exported to '{fname_out}'") if __name__ == '__main__': diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index cd9644fcb..9349de3b3 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import struct @@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("ggml-to-gguf") + class GGMLFormat(IntEnum): GGML = 0 @@ -125,7 +128,6 @@ class Tensor: self.start_offset = offset self.len_bytes = n_bytes offset += n_bytes - # print(n_dims, name_len, dtype, self.dims, self.name, pad) return offset - orig_offset @@ -175,7 +177,7 @@ class GGMLModel: offset += self.validate_header(data, offset) hp = Hyperparameters() offset += hp.load(data, offset) - print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') + logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}') self.validate_conversion(hp.ftype) vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML) offset += vocab.load(data, offset, hp.n_vocab) @@ -215,12 +217,12 @@ class GGMLToGGUF: if float(hp.n_head) / float(x) == gqa: n_kv_head = x assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param" - print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') + logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}') self.n_kv_head = n_kv_head self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer) def save(self): - print('* Preparing to save GGUF file') + logger.info('* Preparing to save GGUF file') gguf_writer = gguf.GGUFWriter( self.cfg.output, gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA], @@ -230,11 +232,11 @@ class GGMLToGGUF: if self.special_vocab is not None: self.special_vocab.add_to_gguf(gguf_writer) self.add_tensors(gguf_writer) - print(" gguf: write header") + logger.info(" gguf: write header") gguf_writer.write_header_to_file() - print(" gguf: write metadata") + logger.info(" gguf: write metadata") gguf_writer.write_kv_data_to_file() - print(" gguf: write tensors") + logger.info(" gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() @@ -250,7 +252,7 @@ class GGMLToGGUF: name = cfg.name if cfg.name is not None else cfg.input.name except UnicodeDecodeError: name = None - print('* Adding model parameters and KV items') + logger.info('* Adding model parameters and KV items') if name is not None: gguf_writer.add_name(name) gguf_writer.add_description(desc) @@ -281,12 +283,13 @@ class GGMLToGGUF: def add_vocab(self, gguf_writer): hp = self.model.hyperparameters gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_tokenizer_pre('default') tokens = [] scores = [] toktypes = [] if self.vocab_override is not None: vo = self.vocab_override - print('* Adding vocab item(s)') + logger.info('* Adding vocab item(s)') for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()): tokens.append(vbytes) scores.append(score) @@ -298,7 +301,7 @@ class GGMLToGGUF: if len(toktypes) > 0: gguf_writer.add_token_types(toktypes) return - print(f'* Adding {hp.n_vocab} vocab item(s)') + logger.info(f'* Adding {hp.n_vocab} vocab item(s)') assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab' for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items): tt = 1 # Normal @@ -333,7 +336,7 @@ class GGMLToGGUF: def add_tensors(self, gguf_writer): tensor_map = self.name_map data = self.data - print(f'* Adding {len(self.model.tensors)} tensor(s)') + logger.info(f'* Adding {len(self.model.tensors)} tensor(s)') for tensor in self.model.tensors: name = str(tensor.name, 'UTF-8') mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) @@ -343,7 +346,6 @@ class GGMLToGGUF: temp = tempdims[1] tempdims[1] = tempdims[0] tempdims[0] = temp - # print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}') gguf_writer.add_tensor( mapped_name, data[tensor.start_offset:tensor.start_offset + tensor.len_bytes], @@ -400,33 +402,35 @@ def handle_args(): help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") parser.add_argument("--vocabtype", default="spm,hfft", help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") return parser.parse_args() def main(): cfg = handle_args() - print(f'* Using config: {cfg}') - print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n') + logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO) + logger.info(f'* Using config: {cfg}') + logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===') if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'): - print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') + logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".') data = np.memmap(cfg.input, mode = 'r') model = GGMLModel() - print('* Scanning GGML input file') + logger.info('* Scanning GGML input file') offset = model.load(data, 0) # noqa - print(f'* GGML model hyperparameters: {model.hyperparameters}') + logger.info(f'* GGML model hyperparameters: {model.hyperparameters}') vocab_override = None params_override = None special_vocab = None if cfg.model_metadata_dir is not None: (params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters) - print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') - print(f'* Overriding params: {params_override}') - print(f'* Overriding vocab: {vocab_override}') - print(f'* Special vocab: {special_vocab}') + logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.') + logger.info(f'* Overriding params: {params_override}') + logger.info(f'* Overriding vocab: {vocab_override}') + logger.info(f'* Special vocab: {special_vocab}') else: - print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') + logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n') if model.file_format == GGMLFormat.GGML: - print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') + logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!') converter = GGMLToGGUF( model, data, cfg, params_override = params_override, @@ -434,7 +438,7 @@ def main(): special_vocab = special_vocab ) converter.save() - print(f'* Successful completion. Output saved to: {cfg.output}') + logger.info(f'* Successful completion. Output saved to: {cfg.output}') if __name__ == '__main__': diff --git a/convert-lora-to-ggml.py b/convert-lora-to-ggml.py index 9a9936dec..f09fa85fe 100755 --- a/convert-lora-to-ggml.py +++ b/convert-lora-to-ggml.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import json import os import struct @@ -15,6 +16,9 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) import gguf +logging.basicConfig(level=logging.DEBUG) +logger = logging.getLogger("lora-to-gguf") + NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1} @@ -48,11 +52,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty if __name__ == '__main__': if len(sys.argv) < 2: - print(f"Usage: python {sys.argv[0]} [arch]") - print( - "Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'" - ) - print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") + logger.info(f"Usage: python {sys.argv[0]} [arch]") + logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'") + logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)") sys.exit(1) input_json = os.path.join(sys.argv[1], "adapter_config.json") @@ -70,7 +72,7 @@ if __name__ == '__main__': arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama" if arch_name not in gguf.MODEL_ARCH_NAMES.values(): - print(f"Error: unsupported architecture {arch_name}") + logger.error(f"Error: unsupported architecture {arch_name}") sys.exit(1) arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)] @@ -80,21 +82,21 @@ if __name__ == '__main__': params = json.load(f) if params["peft_type"] != "LORA": - print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") + logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA") sys.exit(1) if params["fan_in_fan_out"] is True: - print("Error: param fan_in_fan_out is not supported") + logger.error("Error: param fan_in_fan_out is not supported") sys.exit(1) if params["bias"] is not None and params["bias"] != "none": - print("Error: param bias is not supported") + logger.error("Error: param bias is not supported") sys.exit(1) # TODO: these seem to be layers that have been trained but without lora. # doesn't seem widely used but eventually should be supported if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0: - print("Error: param modules_to_save is not supported") + logger.error("Error: param modules_to_save is not supported") sys.exit(1) with open(output_path, "wb") as fout: @@ -125,13 +127,13 @@ if __name__ == '__main__': suffix = k[-len(lora_suffixes[0]):] k = k[: -len(lora_suffixes[0])] else: - print(f"Error: unrecognized tensor name {orig_k}") + logger.error(f"Error: unrecognized tensor name {orig_k}") sys.exit(1) tname = name_map.get_name(k) if tname is None: - print(f"Error: could not map tensor name {orig_k}") - print(" Note: the arch parameter must be specified if the model is not llama") + logger.error(f"Error: could not map tensor name {orig_k}") + logger.error(" Note: the arch parameter must be specified if the model is not llama") sys.exit(1) if suffix == ".lora_A.weight": @@ -141,8 +143,8 @@ if __name__ == '__main__': else: assert False - print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") + logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB") write_tensor_header(fout, tname, t.shape, t.dtype) t.tofile(fout) - print(f"Converted {input_json} and {input_model} to {output_path}") + logger.info(f"Converted {input_json} and {input_model} to {output_path}") diff --git a/convert-persimmon-to-gguf.py b/convert-persimmon-to-gguf.py index def210531..07dcade74 100755 --- a/convert-persimmon-to-gguf.py +++ b/convert-persimmon-to-gguf.py @@ -1,4 +1,7 @@ #!/usr/bin/env python3 +from __future__ import annotations + +import logging import argparse import os import sys @@ -12,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf +logger = logging.getLogger("persimmon-to-gguf") + def _flatten_dict(dct, tensors, prefix=None): assert isinstance(dct, dict) @@ -28,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None): def _get_sentencepiece_tokenizer_info(dir_model: Path): tokenizer_path = dir_model / 'adept_vocab.model' - print('gguf: getting sentencepiece tokenizer from', tokenizer_path) + logger.info('getting sentencepiece tokenizer from', tokenizer_path) tokenizer = SentencePieceProcessor(str(tokenizer_path)) - print('gguf: adding tokens') + logger.info('adding tokens') tokens: list[bytes] = [] scores: list[float] = [] toktypes: list[int] = [] @@ -65,8 +70,10 @@ def main(): parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file") parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release") - parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) sys.path.append(str(args.adept_inference_dir)) persimmon_model = torch.load(args.ckpt_path) hparams = persimmon_model['args'] @@ -97,6 +104,7 @@ def main(): tokens, scores, toktypes = _get_sentencepiece_tokenizer_info(args.model_dir) gguf_writer.add_tokenizer_model('llama') + gguf_writer.add_tokenizer_pre('default') gguf_writer.add_token_list(tokens) gguf_writer.add_token_scores(scores) gguf_writer.add_token_types(toktypes) @@ -104,32 +112,31 @@ def main(): gguf_writer.add_eos_token_id(71013) tensor_map = gguf.get_tensor_name_map(arch, block_count) - print(tensor_map) + logger.info(tensor_map) for name in tensors.keys(): - data = tensors[name] + data_torch = tensors[name] if name.endswith(".self_attention.rotary_emb.inv_freq"): continue - old_dtype = data.dtype + old_dtype = data_torch.dtype # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?) - data = data.to(torch.float32).squeeze().numpy() + data = data_torch.to(torch.float32).squeeze().numpy() new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias")) if new_name is None: - print("Can not map tensor '" + name + "'") - sys.exit() + raise ValueError(f"Can not map tensor '{name}'") + n_dims = len(data.shape) - print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype)) + logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}") gguf_writer.add_tensor(new_name, data) - print("gguf: write header") + logger.info("gguf: write header") gguf_writer.write_header_to_file() - print("gguf: write metadata") + logger.info("gguf: write metadata") gguf_writer.write_kv_data_to_file() - print("gguf: write tensors") + logger.info("gguf: write tensors") gguf_writer.write_tensors_to_file() gguf_writer.close() - print(f"gguf: model successfully exported to '{args.outfile}'") - print("") + logger.info(f"gguf: model successfully exported to '{args.outfile}'") if __name__ == '__main__': diff --git a/convert.py b/convert.py index 817cb6612..148bfd66a 100755 --- a/convert.py +++ b/convert.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import concurrent.futures import enum @@ -16,13 +17,14 @@ import re import signal import struct import sys +import textwrap import time import zipfile -from abc import ABCMeta, abstractmethod +from abc import ABC, abstractmethod from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path -from typing import IO, TYPE_CHECKING, Any, Callable, Iterable, Literal, TypeVar +from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable import numpy as np from sentencepiece import SentencePieceProcessor @@ -32,7 +34,9 @@ if 'NO_LOCAL_GGUF' not in os.environ: import gguf if TYPE_CHECKING: - from typing import TypeAlias + from typing_extensions import Self, TypeAlias + +logger = logging.getLogger("convert") if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'): faulthandler.register(signal.SIGUSR1) @@ -43,6 +47,9 @@ ARCH = gguf.MODEL_ARCH.LLAMA DEFAULT_CONCURRENCY = 8 +ADDED_TOKENS_FILE = 'added_tokens.json' +FAST_TOKENIZER_FILE = 'tokenizer.json' + # # data types # @@ -135,7 +142,8 @@ class GGMLFileType(enum.IntEnum): dt = GGML_FILE_TYPE_TO_DATA_TYPE.get(self) if dt is None: raise ValueError(self) - # 1D tensors are always F32. + # Convert all 1D tensors to F32. Most of the codebase that takes in 1D tensors only handles F32 tensors, and most of the outputs tensors are F32. + # Also The 1d tensors aren't much of a performance/size issue. So instead of having to have separate F32 and F16 implementations of both, just convert everything to F32 for now. return dt if len(tensor.shape) > 1 else DT_F32 @@ -188,8 +196,10 @@ class Params: n_layer = next(i for i in itertools.count() if f"layers.{i}.attention.wq.weight" not in model) if n_layer < 1: - raise Exception("failed to guess 'n_layer'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + msg = """\ + failed to guess 'n_layer'. This model is unknown or unsupported. + Suggestion: provide 'config.json' of the model in the same directory containing model files.""" + raise KeyError(textwrap.dedent(msg)) n_head = n_embd // 128 # guessed n_mult = 256 # guessed @@ -211,7 +221,8 @@ class Params: @staticmethod def loadHFTransformerJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) + with open(config_path) as f: + config = json.load(f) rope_scaling_type = f_rope_scale = n_orig_ctx = rope_finetuned = None rope_scaling = config.get("rope_scaling") @@ -233,8 +244,10 @@ class Params: elif "max_position_embeddings" in config: n_ctx = config["max_position_embeddings"] else: - raise Exception("failed to guess 'n_ctx'. This model is unknown or unsupported.\n" - "Suggestion: provide 'config.json' of the model in the same directory containing model files.") + msg = """\ + failed to guess 'n_ctx'. This model is unknown or unsupported. + Suggestion: provide 'config.json' of the model in the same directory containing model files.""" + raise KeyError(textwrap.dedent(msg)) n_experts = None n_experts_used = None @@ -265,11 +278,13 @@ class Params: # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1} @staticmethod def loadOriginalParamsJson(model: LazyModel, config_path: Path) -> Params: - config = json.load(open(config_path)) + with open(config_path) as f: + config = json.load(f) n_experts = None n_experts_used = None f_rope_freq_base = None + n_ff = None # hack to determine LLaMA v1 vs v2 vs CodeLlama if config.get("moe"): @@ -294,6 +309,8 @@ class Params: n_experts_used = config["moe"]["num_experts_per_tok"] f_rope_freq_base = 1e6 + assert n_ff is not None + return Params( n_vocab = model["tok_embeddings.weight"].shape[0], n_embd = config["dim"], @@ -331,47 +348,86 @@ class Params: # vocab # -class BpeVocab: +@runtime_checkable +class BaseVocab(Protocol): + tokenizer_model: ClassVar[str] + name: ClassVar[str] + + +class NoVocab(BaseVocab): + tokenizer_model = "no_vocab" + name = "no_vocab" + + def __repr__(self) -> str: + return "" + + +@runtime_checkable +class Vocab(BaseVocab, Protocol): + vocab_size: int + added_tokens_dict: dict[str, int] + added_tokens_list: list[str] + fname_tokenizer: Path + + def __init__(self, base_path: Path): ... + def all_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: ... + + +class BpeVocab(Vocab): tokenizer_model = "gpt2" name = "bpe" - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read()) - if isinstance(self.bpe_tokenizer.get('model'), dict): - self.vocab = self.bpe_tokenizer["model"]["vocab"] - else: - self.vocab = self.bpe_tokenizer - added_tokens: dict[str, int] - if fname_added_tokens is not None: - # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - # Fall back to trying to find the added tokens in tokenizer.json - tokenizer_json_file = fname_tokenizer.parent / 'tokenizer.json' - if not tokenizer_json_file.is_file(): - added_tokens = {} - else: - tokenizer_json = json.load(open(tokenizer_json_file, encoding="utf-8")) - added_tokens = dict( - (item['content'], item['id']) - for item in tokenizer_json.get('added_tokens', []) - # Added tokens here can be duplicates of the main vocabulary. - if item['content'] not in self.bpe_tokenizer) + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} - vocab_size: int = len(self.vocab) - expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) - actual_ids = sorted(added_tokens.values()) + if (fname_tokenizer := base_path / 'vocab.json').exists(): + # "slow" tokenizer + with open(fname_tokenizer, encoding="utf-8") as f: + self.vocab = json.load(f) + + try: + # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab. + with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + else: + # "fast" tokenizer + fname_tokenizer = base_path / FAST_TOKENIZER_FILE + + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding="utf-8") as f: + tokenizer_json = json.load(f) + + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + if ( + tokenizer_model['type'] != 'BPE' or tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'ByteLevel' + ): + raise FileNotFoundError('Cannot find GPT-2 BPE tokenizer') + + self.vocab = tokenizer_model["vocab"] + + if (added := tokenizer_json.get('added_tokens')) is not None: + # Added tokens here can be duplicates of the main vocabulary. + added_tokens = {item['content']: item['id'] + for item in added + if item['content'] not in self.vocab} + + vocab_size = len(self.vocab) + expected_ids = list(range(vocab_size, vocab_size + len(added_tokens))) + actual_ids = sorted(added_tokens.values()) if expected_ids != actual_ids: expected_end_id = vocab_size + len(actual_ids) - 1 - raise Exception(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range {vocab_size} - {expected_end_id}; got {actual_ids}") + raise ValueError(f"Expected the {len(actual_ids)} added token ID(s) to be sequential in the range " + f"{vocab_size} - {expected_end_id}; got {actual_ids}") items = sorted(added_tokens.items(), key=lambda text_idx: text_idx[1]) self.added_tokens_dict = added_tokens self.added_tokens_list = [text for (text, idx) in items] - self.vocab_size_base: int = vocab_size - self.vocab_size: int = self.vocab_size_base + len(self.added_tokens_list) + self.vocab_size_base = vocab_size + self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = {id: encoded_tok for encoded_tok, id in self.vocab.items()} @@ -392,19 +448,26 @@ class BpeVocab: return f"" -class SentencePieceVocab: +class SentencePieceVocab(Vocab): tokenizer_model = "llama" name = "spm" - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None: - self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer)) - added_tokens: dict[str, int] - if fname_added_tokens is not None: - added_tokens = json.load(open(fname_added_tokens, encoding="utf-8")) - else: - added_tokens = {} + def __init__(self, base_path: Path): + added_tokens: dict[str, int] = {} + if (fname_tokenizer := base_path / 'tokenizer.model').exists(): + # normal location + try: + with open(base_path / ADDED_TOKENS_FILE, encoding="utf-8") as f: + added_tokens = json.load(f) + except FileNotFoundError: + pass + elif not (fname_tokenizer := base_path.parent / 'tokenizer.model').exists(): + # not found in alternate location either + raise FileNotFoundError('Cannot find tokenizer.model') - vocab_size: int = self.sentencepiece_tokenizer.vocab_size() + self.sentencepiece_tokenizer = SentencePieceProcessor() + self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer)) + vocab_size = self.sentencepiece_tokenizer.vocab_size() new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size} expected_new_ids = list(range(vocab_size, vocab_size + len(new_tokens))) @@ -414,33 +477,32 @@ class SentencePieceVocab: raise ValueError(f"Expected new token IDs {expected_new_ids} to be sequential; got {actual_new_ids}") # Token pieces that were added to the base vocabulary. - self.added_tokens_dict = added_tokens + self.added_tokens_dict = added_tokens self.added_tokens_list = [new_tokens[id] for id in actual_new_ids] self.vocab_size_base = vocab_size self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.sentencepiece_tokenizer for i in range(tokenizer.vocab_size()): - piece = tokenizer.id_to_piece(i) - text: bytes = piece.encode("utf-8") - score: float = tokenizer.get_score(i) + piece = tokenizer.IdToPiece(i) + text = piece.encode("utf-8") + score: float = tokenizer.GetScore(i) toktype = gguf.TokenType.NORMAL - if tokenizer.is_unknown(i): + if tokenizer.IsUnknown(i): toktype = gguf.TokenType.UNKNOWN - if tokenizer.is_control(i): + if tokenizer.IsControl(i): toktype = gguf.TokenType.CONTROL # NOTE: I think added_tokens are user defined. # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED - if tokenizer.is_unused(i): + if tokenizer.IsUnused(i): toktype = gguf.TokenType.UNUSED - if tokenizer.is_byte(i): + if tokenizer.IsByte(i): toktype = gguf.TokenType.BYTE yield text, score, toktype @@ -458,27 +520,47 @@ class SentencePieceVocab: return f"" -class HfVocab: +class LlamaHfVocab(Vocab): tokenizer_model = "llama" name = "hfft" - def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None = None) -> None: + def __init__(self, base_path: Path): + fname_tokenizer = base_path / FAST_TOKENIZER_FILE + # if this fails, FileNotFoundError propagates to caller + with open(fname_tokenizer, encoding='utf-8') as f: + tokenizer_json = json.load(f) + + # pre-check so we know if we need transformers + tokenizer_model: dict[str, Any] = tokenizer_json['model'] + is_llama3 = ( + tokenizer_model['type'] == 'BPE' and tokenizer_model.get('ignore_merges', False) + and not tokenizer_model.get('byte_fallback', True) + ) + if is_llama3: + raise TypeError('Llama 3 must be converted with BpeVocab') + + if not is_llama3 and ( + tokenizer_model['type'] != 'BPE' or not tokenizer_model.get('byte_fallback', False) + or tokenizer_json['decoder']['type'] != 'Sequence' + ): + raise FileNotFoundError('Cannot find Llama BPE tokenizer') + try: from transformers import AutoTokenizer except ImportError as e: raise ImportError( - "To use HfVocab, please install the `transformers` package. " + "To use LlamaHfVocab, please install the `transformers` package. " "You can install it with `pip install transformers`." ) from e - print("fname_tokenizer:", fname_tokenizer) # Allow the tokenizer to default to slow or fast versions. # Explicitly set tokenizer to use local paths. self.tokenizer = AutoTokenizer.from_pretrained( - fname_tokenizer, - cache_dir=fname_tokenizer, + base_path, + cache_dir=base_path, local_files_only=True, ) + assert self.tokenizer.is_fast # assume tokenizer.json is used # Initialize lists and dictionaries for added tokens self.added_tokens_list = [] @@ -506,8 +588,7 @@ class HfVocab: self.vocab_size_base = self.tokenizer.vocab_size self.vocab_size = self.vocab_size_base + len(self.added_tokens_list) - self.fname_tokenizer = fname_tokenizer - self.fname_added_tokens = fname_added_tokens + self.fname_tokenizer = fname_tokenizer def hf_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: reverse_vocab = { @@ -559,18 +640,7 @@ class HfVocab: yield from self.added_tokens() def __repr__(self) -> str: - return f"" - - -class NoVocab: - tokenizer_model = "no_vocab" - name = "no_vocab" - - def __repr__(self) -> str: - return "" - - -Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab" + return f"" # @@ -580,7 +650,6 @@ Vocab: TypeAlias = "BpeVocab | SentencePieceVocab | HfVocab | NoVocab" def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: - # print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) ) if n_head_kv is not None and n_head != n_head_kv: n_head = n_head_kv return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:]) @@ -588,17 +657,18 @@ def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray: .reshape(weights.shape)) -class Tensor(metaclass=ABCMeta): +class Tensor(ABC): + ndarray: NDArray data_type: DataType @abstractmethod - def astype(self, data_type: DataType) -> Tensor: ... + def astype(self, data_type: DataType) -> Self: ... @abstractmethod - def permute(self, n_head: int, n_head_kv: int) -> Tensor: ... + def permute(self, n_head: int, n_head_kv: int) -> Self: ... @abstractmethod - def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: ... + def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> Self: ... @abstractmethod - def part(self, n_part: int) -> UnquantizedTensor: ... + def part(self, n_part: int) -> Self: ... @abstractmethod def to_ggml(self) -> GGMLCompatibleTensor: ... @@ -610,18 +680,18 @@ def bf16_to_fp32(bf16_arr: np.ndarray[Any, np.dtype[np.uint16]]) -> NDArray: class UnquantizedTensor(Tensor): - def __init__(self, ndarray: NDArray) -> None: + def __init__(self, ndarray: NDArray): assert isinstance(ndarray, np.ndarray) self.ndarray = ndarray self.data_type = NUMPY_TYPE_TO_DATA_TYPE[ndarray.dtype] - def astype(self, data_type: DataType) -> Tensor: + def astype(self, data_type: DataType) -> UnquantizedTensor: dtype = data_type.dtype if self.data_type == DT_BF16: self.ndarray = bf16_to_fp32(self.ndarray) return UnquantizedTensor(self.ndarray.astype(dtype)) - def to_ggml(self) -> UnquantizedTensor: + def to_ggml(self) -> Self: return self def permute_part(self, n_part: int, n_head: int, n_head_kv: int) -> UnquantizedTensor: @@ -689,7 +759,7 @@ class ModelPlus: model: LazyModel paths: list[Path] # Where this was read from. format: Literal['ggml', 'torch', 'safetensors', 'none'] - vocab: Vocab | None # For GGML models (which have vocab built in), the vocab. + vocab: BaseVocab | None # For GGML models (which have vocab built in), the vocab. def merge_sharded(models: list[LazyModel]) -> LazyModel: @@ -698,7 +768,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel: names = {name: None for model in models for name in model} def convert(name: str) -> LazyTensor: - lazy_tensors: list[LazyTensor] = [model[name] for model in models] + lazy_tensors = [model[name] for model in models] if len(lazy_tensors) == 1: # only one file; don't go through this procedure since there might # be quantized tensors @@ -719,7 +789,7 @@ def merge_sharded(models: list[LazyModel]) -> LazyModel: def load() -> UnquantizedTensor: ndarrays = [load_unquantized(tensor) for tensor in lazy_tensors] - concatenated: NDArray = np.concatenate(ndarrays, axis=axis) + concatenated = np.concatenate(ndarrays, axis=axis) return UnquantizedTensor(concatenated) description = 'concatenated[[' + '] | ['.join(lt.description for lt in lazy_tensors) + ']]' return LazyTensor(load, concatenated_shape, lazy_tensors[0].data_type, description) @@ -771,6 +841,15 @@ def part_lazy(lazy_tensor: LazyTensor, n_part: int) -> LazyTensor: return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description) +def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor: + def load() -> Tensor: + tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors] + return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors])) + s = lazy_tensors[0].shape.copy() + s.insert(0, len(lazy_tensors)) + return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors)) + + # Functionality that simulates `torch.load` but where individual tensors are # only loaded into memory on demand, not all at once. # PyTorch can't do this natively as of time of writing: @@ -807,10 +886,10 @@ class LazyUnpickler(pickle.Unpickler): def load(offset: int, elm_count: int) -> NDArray: dtype = data_type.dtype - fp = self.zip_file.open(info) - fp.seek(offset * dtype.itemsize) - size = elm_count * dtype.itemsize - data = fp.read(size) + with self.zip_file.open(info) as fp: + fp.seek(offset * dtype.itemsize) + size = elm_count * dtype.itemsize + data = fp.read(size) assert len(data) == size return np.frombuffer(data, dtype) description = f'storage data_type={data_type} path-in-zip={filename} path={self.zip_file.filename}' @@ -831,7 +910,7 @@ class LazyUnpickler(pickle.Unpickler): def rebuild_from_type_v2(func, new_type, args, state): return func(*args) - CLASSES: dict[tuple[str, str], Any] = { + CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = { # getattr used here as a workaround for mypy not being smart enough to determine # the staticmethods have a __func__ attribute. ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'), @@ -890,7 +969,7 @@ def lazy_load_safetensors_file(fp: IO[bytes], path: Path) -> ModelPlus: def must_read(fp: IO[bytes], length: int) -> bytes: ret = fp.read(length) if len(ret) < length: - raise Exception("unexpectedly reached end of file") + raise EOFError("unexpectedly reached end of file") return ret @@ -948,23 +1027,24 @@ def bounded_parallel_map(func: Callable[[In], Out], iterable: Iterable[In], conc yield result -def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> None: +def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False) -> None: # Handle special case where the model's vocab size is not set if params.n_vocab == -1: raise ValueError( - f"The model's vocab size is set to -1 in params.json. Please update it manually.{f' Maybe {vocab.vocab_size}?' if hasattr(vocab, 'vocab_size') else ''}" + "The model's vocab size is set to -1 in params.json. Please update it manually." + + (f" Maybe {vocab.vocab_size}?" if isinstance(vocab, Vocab) else ""), ) - if isinstance(vocab, NoVocab): + if not isinstance(vocab, Vocab): return # model has no vocab # Check for a vocab size mismatch if params.n_vocab == vocab.vocab_size: - print("Ignoring added_tokens.json since model matches vocab size without it.") + logger.warning("Ignoring added_tokens.json since model matches vocab size without it.") return if pad_vocab and params.n_vocab > vocab.vocab_size: pad_count = params.n_vocab - vocab.vocab_size - print( + logger.debug( f"Padding vocab with {pad_count} token(s) - through " ) for i in range(1, pad_count + 1): @@ -979,11 +1059,11 @@ def check_vocab_size(params: Params, vocab: Vocab, pad_vocab: bool = False) -> N if vocab.vocab_size < params.n_vocab: msg += " Add the --pad-vocab option and try again." - raise Exception(msg) + raise ValueError(msg) class OutputFile: - def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE) -> None: + def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE): self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess) def add_meta_arch(self, params: Params) -> None: @@ -1034,8 +1114,6 @@ class OutputFile: self.gguf.add_file_type(params.ftype) def extract_vocabulary_from_model(self, vocab: Vocab) -> tuple[list[bytes], list[float], list[gguf.TokenType]]: - assert not isinstance(vocab, NoVocab) - tokens = [] scores = [] toktypes = [] @@ -1094,7 +1172,7 @@ class OutputFile: elapsed = time.time() - start size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape) padi = len(str(len(model))) - print( + logger.info( f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}" ) self.gguf.write_tensor_data(ndarray) @@ -1135,7 +1213,7 @@ class OutputFile: @staticmethod def write_all( - fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: Vocab, svocab: gguf.SpecialVocab, + fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab, concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, ) -> None: @@ -1145,11 +1223,11 @@ class OutputFile: # meta data of.add_meta_arch(params) - if isinstance(vocab, NoVocab): - of.gguf.add_tokenizer_model(vocab.tokenizer_model) - else: + if isinstance(vocab, Vocab): of.add_meta_vocab(vocab) of.add_meta_special_vocab(svocab) + else: # NoVocab + of.gguf.add_tokenizer_model(vocab.tokenizer_model) # tensor info for name, lazy_tensor in model.items(): @@ -1176,7 +1254,7 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT name_to_type = {name: lazy_tensor.data_type for (name, lazy_tensor) in model.items()} - raise Exception(f"Unexpected combination of types: {name_to_type}") + raise ValueError(f"Unexpected combination of types: {name_to_type}") def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel: @@ -1186,19 +1264,35 @@ def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyM def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> LazyModel: tmap = gguf.TensorNameMap(ARCH, params.n_layer) - should_skip: set[gguf.MODEL_TENSOR] = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) + should_skip = set(gguf.MODEL_TENSOR_SKIP.get(ARCH, [])) tmp = model + # merge experts into one tensor + if params.n_experts and params.n_experts > 0: + for i_l in range(params.n_layer): + for w in range(1, 4): + experts = [] + for e in range(params.n_experts): + if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model: + experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]) + del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"] + elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model: + experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]) + del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"] + else: + raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight") + tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts) + # HF models permut or pack some of the tensors, so we need to undo that for i in itertools.count(): if f"model.layers.{i}.self_attn.q_proj.weight" in model: - print(f"Permuting layer {i}") + logger.debug(f"Permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv) # tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"] elif f"model.layers.{i}.self_attn.W_pack.weight" in model: - print(f"Unpacking and permuting layer {i}") + logger.debug(f"Unpacking and permuting layer {i}") tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head) tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv) tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2) @@ -1211,16 +1305,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) -> tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None) if name_new is None: if skip_unknown: - print(f"Unexpected tensor name: {name} - skipping") + logger.warning(f"Unexpected tensor name: {name} - skipping") continue - else: - raise Exception(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") + raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)") if tensor_type in should_skip: - print(f"skipping tensor {name_new}") + logger.debug(f"skipping tensor {name_new}") continue - print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") + logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}") out[name_new] = lazy_tensor return out @@ -1231,7 +1324,7 @@ def nth_multifile_path(path: Path, n: int) -> Path | None: the nth path in the model. ''' # Support the following patterns: - patterns: list[tuple[str, str]] = [ + patterns = [ # - x.00.pth, x.01.pth, etc. (r'\.[0-9]{2}\.pth$', f'.{n:02}.pth'), # - x-00001-of-00002.bin, x-00002-of-00002.bin, etc. @@ -1270,22 +1363,22 @@ def load_some_model(path: Path) -> ModelPlus: # Be extra-friendly and accept either a file or a directory: if path.is_dir(): # Check if it's a set of safetensors files first - globs = ["model-00001-of-*.safetensors", "model.safetensors"] + globs = ["model-00001-of-*.safetensors", "model.safetensors", "consolidated.safetensors"] files = [file for glob in globs for file in path.glob(glob)] if not files: # Try the PyTorch patterns too, with lower priority globs = ["consolidated.00.pth", "pytorch_model-00001-of-*.bin", "*.pt", "pytorch_model.bin"] files = [file for glob in globs for file in path.glob(glob)] if not files: - raise Exception(f"Can't find model in directory {path}") + raise FileNotFoundError(f"Can't find model in directory {path}") if len(files) > 1: - raise Exception(f"Found multiple models in {path}, not sure which to pick: {files}") + raise ValueError(f"Found multiple models in {path}, not sure which to pick: {files}") path = files[0] paths = find_multifile_paths(path) models_plus: list[ModelPlus] = [] for path in paths: - print(f"Loading model file {path}") + logger.info(f"Loading model file {path}") models_plus.append(lazy_load_file(path)) model_plus = merge_multifile_models(models_plus) @@ -1293,36 +1386,14 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: - _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"} + _VOCAB_CLASSES: list[type[Vocab]] = [SentencePieceVocab, BpeVocab, LlamaHfVocab] def __init__(self, path: Path): self.path = path - self.file_paths = self._detect_files() - print(f"Found vocab files: {self.file_paths}") - def _detect_files(self) -> dict[str, Path | None]: - def locate(file: str) -> Path | None: - if (path := self.path / file).exists(): - return path - if (path := self.path.parent / file).exists(): - return path - return None - - return {vt: locate(f) for vt, f in self._FILES.items()} - - def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]: - for vtype in vocab_types: - try: - path = self.file_paths[vtype] - except KeyError: - raise ValueError(f"Unsupported vocabulary type {vtype}") from None - if path is not None: - return vtype, path - raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}") - - def _create_special_vocab(self, vocab: Vocab, model_parent_path: Path) -> gguf.SpecialVocab: + def _create_special_vocab(self, vocab: BaseVocab, model_parent_path: Path) -> gguf.SpecialVocab: load_merges = vocab.name == "bpe" - n_vocab = vocab.vocab_size if hasattr(vocab, "vocab_size") else None + n_vocab = vocab.vocab_size if isinstance(vocab, Vocab) else None return gguf.SpecialVocab( model_parent_path, load_merges=load_merges, @@ -1331,27 +1402,29 @@ class VocabFactory: ) def _create_vocab_by_path(self, vocab_types: list[str]) -> Vocab: - vocab_type, path = self._select_file(vocab_types) - print(f"Loading vocab file {path!r}, type {vocab_type!r}") + vocab_classes: dict[str, type[Vocab]] = {cls.name: cls for cls in self._VOCAB_CLASSES} + selected_vocabs: dict[str, type[Vocab]] = {} + for vtype in vocab_types: + try: + selected_vocabs[vtype] = vocab_classes[vtype] + except KeyError: + raise ValueError(f"Unsupported vocabulary type {vtype}") from None - added_tokens_path = path.parent / "added_tokens.json" - if vocab_type == "bpe": - return BpeVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - if vocab_type == "spm": - return SentencePieceVocab( - path, added_tokens_path if added_tokens_path.exists() else None - ) - if vocab_type == "hfft": - return HfVocab( - path.parent, added_tokens_path if added_tokens_path.exists() else None - ) - raise ValueError(vocab_type) + for vtype, cls in selected_vocabs.items(): + try: + vocab = cls(self.path) + break + except FileNotFoundError: + pass # ignore unavailable tokenizers + else: + raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}") - def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: - vocab: Vocab - if len(vocab_types) == 1 and "no_vocab" in vocab_types: + logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}") + return vocab + + def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]: + vocab: BaseVocab + if vocab_types is None: vocab = NoVocab() else: vocab = self._create_vocab_by_path(vocab_types) @@ -1371,19 +1444,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path: }[file_type] ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf" if ret in model_paths: - sys.stderr.write( + logger.error( f"Error: Default output path ({ret}) would overwrite the input. " - "Please explicitly specify a path using --outfile.\n") + "Please explicitly specify a path using --outfile.") sys.exit(1) return ret def do_dump_model(model_plus: ModelPlus) -> None: - print(f"model_plus.paths = {model_plus.paths!r}") - print(f"model_plus.format = {model_plus.format!r}") - print(f"model_plus.vocab = {model_plus.vocab!r}") + print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100 + print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100 + print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100 for name, lazy_tensor in model_plus.model.items(): - print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") + print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100 def main(args_in: list[str] | None = None) -> None: @@ -1406,12 +1479,20 @@ def main(args_in: list[str] | None = None) -> None: parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine") parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides") parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args(args_in) - if args.no_vocab: - if args.vocab_only: - raise ValueError("no need to specify --vocab-only if using --no-vocab") - args.vocab_type = "no_vocab" + + if args.verbose: + logging.basicConfig(level=logging.DEBUG) + elif args.dump_single or args.dump: + # Avoid printing anything besides the dump output + logging.basicConfig(level=logging.WARNING) + else: + logging.basicConfig(level=logging.INFO) + + if args.no_vocab and args.vocab_only: + raise ValueError("--vocab-only does not make sense with --no-vocab") if args.dump_single: model_plus = lazy_load_file(args.model) @@ -1426,48 +1507,65 @@ def main(args_in: list[str] | None = None) -> None: if args.dump: do_dump_model(model_plus) return + endianess = gguf.GGUFEndian.LITTLE if args.big_endian: endianess = gguf.GGUFEndian.BIG - params = Params.load(model_plus) - if params.n_ctx == -1: - if args.ctx is None: - raise Exception("The model doesn't have a context size, and you didn't specify one with --ctx\n" - "Please specify one with --ctx:\n" - " - LLaMA v1: --ctx 2048\n" - " - LLaMA v2: --ctx 4096\n") - params.n_ctx = args.ctx + params = None + if args.pad_vocab or not args.vocab_only: + params = Params.load(model_plus) + if params.n_ctx == -1: + if args.ctx is None: + msg = """\ + The model doesn't have a context size, and you didn't specify one with --ctx + Please specify one with --ctx: + - LLaMA v1: --ctx 2048 + - LLaMA v2: --ctx 4096""" + parser.error(textwrap.dedent(msg)) + params.n_ctx = args.ctx - if args.outtype: - params.ftype = { - "f32": GGMLFileType.AllF32, - "f16": GGMLFileType.MostlyF16, - "q8_0": GGMLFileType.MostlyQ8_0, - }[args.outtype] + if args.outtype: + params.ftype = { + "f32": GGMLFileType.AllF32, + "f16": GGMLFileType.MostlyF16, + "q8_0": GGMLFileType.MostlyQ8_0, + }[args.outtype] - print(f"params = {params}") + logger.info(f"params = {params}") model_parent_path = model_plus.paths[0].parent vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_factory = VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path) + vocab_types = None if args.no_vocab else args.vocab_type.split(",") + vocab, special_vocab = vocab_factory.load_vocab(vocab_types, model_parent_path) if args.vocab_only: + assert isinstance(vocab, Vocab) if not args.outfile: raise ValueError("need --outfile if using --vocab-only") outfile = args.outfile + if params is None: + params = Params( + n_vocab = vocab.vocab_size, + n_embd = 1, + n_layer = 1, + n_ctx = 1, + n_ff = 1, + n_head = 1, + n_head_kv = 1, + f_norm_eps = 1e-5, + ) OutputFile.write_vocab_only(outfile, params, vocab, special_vocab, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") return if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab: vocab = model_plus.vocab - print(f"Vocab info: {vocab}") - print(f"Special vocab info: {special_vocab}") - + logger.info(f"Vocab info: {vocab}") + logger.info(f"Special vocab info: {special_vocab}") model = model_plus.model model = convert_model_names(model, params, args.skip_unknown) ftype = pick_output_type(model, args.outtype) @@ -1475,11 +1573,11 @@ def main(args_in: list[str] | None = None) -> None: outfile = args.outfile or default_outfile(model_plus.paths, ftype) params.ftype = ftype - print(f"Writing {outfile}, format {ftype}") + logger.info(f"Writing {outfile}, format {ftype}") OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab, concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab) - print(f"Wrote {outfile}") + logger.info(f"Wrote {outfile}") if __name__ == '__main__': diff --git a/docs/BLIS.md b/docs/BLIS.md index 0bcd6eeef..c933766b7 100644 --- a/docs/BLIS.md +++ b/docs/BLIS.md @@ -23,7 +23,7 @@ Install BLIS: sudo make install ``` -We recommend using openmp since it's easier to modify the cores been used. +We recommend using openmp since it's easier to modify the cores being used. ### llama.cpp compilation diff --git a/docs/HOWTO-add-model.md b/docs/HOWTO-add-model.md new file mode 100644 index 000000000..48769cdf6 --- /dev/null +++ b/docs/HOWTO-add-model.md @@ -0,0 +1,119 @@ +## Add a new model architecture to `llama.cpp` + +Adding a model requires few steps: + +1. Convert the model to GGUF +2. Define the model architecture in `llama.cpp` +3. Build the GGML graph implementation + +After following these steps, you can open PR. + +Also, it is important to check that the examples and main ggml backends (CUDA, METAL, CPU) are working with the new architecture, especially: +- [main](../examples/main) +- [imatrix](../examples/imatrix) +- [quantize](../examples/quantize) +- [server](../examples/server) + +### 1. Convert the model to GGUF + +This step is done in python with a `convert` script using the [gguf](https://pypi.org/project/gguf/) library. +Depending on the model architecture, you can use either [convert.py](../convert.py) or [convert-hf-to-gguf.py](../convert-hf-to-gguf.py). + +The convert script reads the model configuration, tokenizer, tensor names+data and converts them to GGUF metadata and tensors. + +The required steps to implement for an HF model are: + +1. Define the model `Model.register` annotation in a new `Model` subclass, example: + +```python +@Model.register("MyModelForCausalLM") +class MyModel(Model): + model_arch = gguf.MODEL_ARCH.GROK +``` + +2. Define the layout of the GGUF tensors in [constants.py](../gguf-py/gguf/constants.py) + +Add an enum entry in `MODEL_ARCH`, the model human friendly name in `MODEL_ARCH_NAMES` and the GGUF tensor names in `MODEL_TENSORS`. + +Example for `falcon` model: +```python + MODEL_ARCH.FALCON: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_NORM_2, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ] +``` + +3. Map the original tensor names to the standardize equivalent in GGUF + +As a general rule, before adding a new tensor name to GGUF, be sure the equivalent naming does not already exist. + +Once you have found the GGUF tensor name equivalent, add it to the [tensor_mapping.py](../gguf-py/gguf/tensor_mapping.py) file. + +If the tensor name is part of a repetitive layer/block, the key word `bid` substitutes it. + +Example for the normalization tensor in attention layers: + +```python +block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { + # Attention norm + MODEL_TENSOR.ATTN_NORM: ( + "gpt_neox.layers.{bid}.input_layernorm", # gptneox + "transformer.h.{bid}.ln_1", # gpt2 gpt-j refact qwen + "transformer.blocks.{bid}.norm_1", # mpt + ... + ) +} +``` + +`transformer.blocks.{bid}.norm_1` will be mapped to `blk.{bid}.attn_norm` in GGUF. + +Depending on the model configuration, tokenizer, code and tensors layout, you will have to override: +- `Model#set_gguf_parameters` +- `Model#set_vocab` +- `Model#write_tensors` + +NOTE: Tensor names must end with `.weight` suffix, that is the convention and several tools like `quantize` expect this to proceed the weights. + +### 2. Define the model architecture in `llama.cpp` + +The model params and tensors layout must be defined in `llama.cpp`: +1. Define a new `llm_arch` +2. Define the tensors layout in `LLM_TENSOR_NAMES` +3. Add any non standard metadata in `llm_load_hparams` +4. Create the tensors for inference in `llm_load_tensors` +5. If the model has a RoPE operation, add the rope type in `llama_rope_type` + +NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions. + +### 3. Build the GGML graph implementation + +This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`. + +Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`. + +When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR. + +Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback). + +## GGUF specification + +https://github.com/ggerganov/ggml/blob/master/docs/gguf.md + +## Resources + +- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268 +- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009 +- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283 +- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406 +- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423 +- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204 +- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491 +- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515 +- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948 diff --git a/docs/token_generation_performance_tips.md b/docs/token_generation_performance_tips.md index d7e863dff..3c4343147 100644 --- a/docs/token_generation_performance_tips.md +++ b/docs/token_generation_performance_tips.md @@ -1,7 +1,7 @@ # Token generation performance troubleshooting -## Verifying that the model is running on the GPU with cuBLAS -Make sure you compiled llama with the correct env variables according to [this guide](../README.md#cublas), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: +## Verifying that the model is running on the GPU with CUDA +Make sure you compiled llama with the correct env variables according to [this guide](../README.md#CUDA), so that llama accepts the `-ngl N` (or `--n-gpu-layers N`) flag. When running llama, you may configure `N` to be very large, and llama will offload the maximum possible number of layers to the GPU, even if it's less than the number you configured. For example: ```shell ./main -m "path/to/model.gguf" -ngl 200000 -p "Please sir, may I have some " ``` diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b59cc65bf..f421769cc 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -19,6 +19,7 @@ else() add_subdirectory(benchmark) add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(embedding) + add_subdirectory(eval-callback) add_subdirectory(finetune) add_subdirectory(gritlm) add_subdirectory(gguf-split) @@ -34,6 +35,7 @@ else() add_subdirectory(perplexity) add_subdirectory(quantize) add_subdirectory(quantize-stats) + add_subdirectory(retrieval) add_subdirectory(save-load-state) add_subdirectory(simple) add_subdirectory(passkey) diff --git a/examples/batched-bench/README.md b/examples/batched-bench/README.md index 34b343f66..bf951baf7 100644 --- a/examples/batched-bench/README.md +++ b/examples/batched-bench/README.md @@ -10,16 +10,16 @@ There are 2 modes of operation: - `prompt is shared` - there is a common prompt of size `PP` used by all batches (i.e. `N_KV = PP + B*TG`) ```bash -./batched-bench MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] +./batched-bench MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] [MMQ] # LLaMA 7B, F16, N_KV_MAX = 16384 (8GB), prompt not shared -./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 0 99 +./batched-bench ./models/llama-7b/ggml-model-f16.gguf 16384 2048 512 0 99 # LLaMA 7B, Q8_0, N_KV_MAX = 16384 (8GB), prompt is shared -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 1 99 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 16384 2048 512 1 99 # custom set of batches -./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32 +./batched-bench ./models/llama-7b/ggml-model-q8_0.gguf 2048 512 512 0 999 0 128,256,512 128,256 1,2,4,8,16,32 ``` ## Sample results diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 19674dfd3..2924d8116 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -32,13 +32,16 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] \n" , argv[0]); + printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] \n" , argv[0]); printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); + printf(" example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); return 1 ; } int n_kv_max = 2048; + int n_batch = 2048; + int n_ubatch = 512; + bool flash_attn = false; int is_pp_shared = 0; int n_gpu_layers = 0; @@ -56,23 +59,35 @@ int main(int argc, char ** argv) { } if (argc >= 4) { - is_pp_shared = std::atoi(argv[3]); + n_batch = std::atoi(argv[3]); } if (argc >= 5) { - n_gpu_layers = std::atoi(argv[4]); + n_ubatch = std::atoi(argv[4]); } if (argc >= 6) { - n_pp = parse_list(argv[5]); + flash_attn = std::atoi(argv[5]); } if (argc >= 7) { - n_tg = parse_list(argv[6]); + is_pp_shared = std::atoi(argv[6]); } if (argc >= 8) { - n_pl = parse_list(argv[7]); + n_gpu_layers = std::atoi(argv[7]); + } + + if (argc >= 9) { + n_pp = parse_list(argv[8]); + } + + if (argc >= 10) { + n_tg = parse_list(argv[9]); + } + + if (argc >= 11) { + n_pl = parse_list(argv[10]); } // init LLM @@ -98,9 +113,11 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); - ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_max; - ctx_params.n_batch = 512; + ctx_params.seed = 1234; + ctx_params.n_ctx = n_kv_max; + ctx_params.n_batch = n_batch; + ctx_params.n_ubatch = n_ubatch; + ctx_params.flash_attn = flash_attn; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -158,7 +175,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift index d75c503d5..dbbd06da5 100644 --- a/examples/batched.swift/Sources/main.swift +++ b/examples/batched.swift/Sources/main.swift @@ -153,7 +153,7 @@ while n_cur <= n_len { // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if new_token_id == llama_token_eos(model) || n_cur == n_len { + if llama_token_is_eog(model, new_token_id) || n_cur == n_len { i_batch[i] = -1 // print("") if n_parallel > 1 { @@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] { private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? { var result = [CChar](repeating: 0, count: 8) - let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count)) + let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false) if nTokens < 0 { let actualTokensCount = -Int(nTokens) result = .init(repeating: 0, count: actualTokensCount) @@ -237,7 +237,8 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String model, token, &result, - Int32(result.count) + Int32(result.count), + false ) assert(check == actualTokensCount) } else { diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index ee1f8f1bf..be30d20bf 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -48,6 +48,8 @@ int main(int argc, char ** argv) { params.prompt = "Hello my name is"; } + process_escapes(params.prompt); + // init LLM llama_backend_init(); @@ -78,7 +80,7 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); ctx_params.seed = 1234; - ctx_params.n_ctx = n_kv_req; + ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); ctx_params.n_seq_max = n_parallel; ctx_params.n_threads = params.n_threads; @@ -189,8 +191,8 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - // is it an end of stream? -> mark the stream as finished - if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + // is it an end of generation? -> mark the stream as finished + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index 866c6d7a6..3d34378a5 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -47,7 +47,7 @@ struct beam_search_callback_data { // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // For example, eob can be flagged due to maximum token length, stop words, etc. static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx)); + return n_tokens && llama_token_is_eog(llama_get_model(callback_data.ctx), tokens[n_tokens-1]); } // Function matching type llama_beam_search_callback_fn_t. diff --git a/examples/convert-llama2c-to-ggml/README.md b/examples/convert-llama2c-to-ggml/README.md index 0f37d295b..742dcf7a3 100644 --- a/examples/convert-llama2c-to-ggml/README.md +++ b/examples/convert-llama2c-to-ggml/README.md @@ -2,7 +2,7 @@ This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default. -To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository: +To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository: `$ make -j` @@ -21,6 +21,8 @@ An example command using a model from [karpathy/tinyllamas](https://huggingface. `$ ./convert-llama2c-to-ggml --copy-vocab-from-model llama-2-7b-chat.gguf.q2_K.bin --llama2c-model stories42M.bin --llama2c-output-model stories42M.gguf.bin` +Note: The vocabulary for `stories260K.bin` should be its own tokenizer `tok512.bin` found in [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K). + Now you can use the model with a command like: `$ ./main -m stories42M.gguf.bin -p "One day, Lily met a Shoggoth" -n 500 -c 256` diff --git a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp index 8209dcb64..746c3fbef 100644 --- a/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp +++ b/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp @@ -1,6 +1,7 @@ #include "ggml.h" #include "llama.h" #include "common.h" +#include "log.h" #include #include @@ -78,111 +79,101 @@ typedef struct { struct TransformerWeights { // token embedding table - float* token_embedding_table; // (vocab_size, dim) + std::vector token_embedding_table; // (vocab_size, dim) // weights for rmsnorms - float* rms_att_weight; // (layer, dim) rmsnorm weights - float* rms_ffn_weight; // (layer, dim) + std::vector rms_att_weight; // (layer, dim) rmsnorm weights + std::vector rms_ffn_weight; // (layer, dim) // weights for matmuls - float* wq; // (layer, dim, dim) - float* wk; // (layer, dim, dim) - float* wv; // (layer, dim, dim) - float* wo; // (layer, dim, dim) + std::vector wq; // (layer, dim, dim) + std::vector wk; // (layer, dim, dim) + std::vector wv; // (layer, dim, dim) + std::vector wo; // (layer, dim, dim) // weights for ffn - float* w1; // (layer, hidden_dim, dim) - float* w2; // (layer, dim, hidden_dim) - float* w3; // (layer, hidden_dim, dim) + std::vector w1; // (layer, hidden_dim, dim) + std::vector w2; // (layer, dim, hidden_dim) + std::vector w3; // (layer, hidden_dim, dim) // final rmsnorm - float* rms_final_weight; // (dim,) + std::vector rms_final_weight; // (dim,) // freq_cis for RoPE relatively positional embeddings - // float* freq_cis_real; // (seq_len, dim/2) - // float* freq_cis_imag; // (seq_len, dim/2) + // std::vector freq_cis_real; // (seq_len, dim/2) + // std::vector freq_cis_imag; // (seq_len, dim/2) // (optional) classifier weights for the logits, on the last layer - float* wcls; - - ~TransformerWeights() { - delete[] token_embedding_table; - delete[] rms_att_weight; - delete[] rms_ffn_weight; - delete[] wq; - delete[] wk; - delete[] wv; - delete[] wo; - delete[] w1; - delete[] w2; - delete[] w3; - delete[] rms_final_weight; - delete[] wcls; - } + std::vector wcls; }; -static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) { - // we calloc instead of malloc to keep valgrind happy - w->token_embedding_table = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); +static void alloc_weights(TransformerWeights * w, const Config * p, bool shared_weights) { + const int n_multiqueries = p->n_kv_heads <= 0 || p->n_kv_heads >= p->n_heads ? 1 : p->n_heads / p->n_kv_heads; + try { + w->token_embedding_table.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); - w->rms_att_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); + w->rms_att_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_att_weight\n",__func__,p->n_layers, p->dim, p->n_layers * p->dim); - w->rms_ffn_weight = new float[p->n_layers * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); + w->rms_ffn_weight.resize(p->n_layers * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->rms_ffn_weight\n",__func__,p->n_layers , p->dim, p->n_layers * p->dim); - w->wq = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wq.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wq\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->wk = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wk.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wk\n",__func__,p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wv = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wv.resize(p->n_layers * p->dim * p->dim / n_multiqueries); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wv\n",__func__, p->n_layers, p->dim, p->dim / n_multiqueries, p->n_layers * p->dim * p->dim / n_multiqueries); - w->wo = new float[p->n_layers * p->dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); + w->wo.resize(p->n_layers * p->dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->wo\n",__func__,p->n_layers, p->dim, p->dim, p->n_layers * p->dim * p->dim); - w->w1 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w1.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w1\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - w->w2 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); + w->w2.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w2\n",__func__,p->n_layers, p->dim, p->hidden_dim, p->n_layers * p->hidden_dim * p->dim); - w->w3 = new float[p->n_layers * p->hidden_dim * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); + w->w3.resize(p->n_layers * p->hidden_dim * p->dim); + LOG("%s: Allocating [%d] x [%d] x [%d] = [%d] float space for w->w3\n",__func__,p->n_layers, p->hidden_dim, p->dim, p->n_layers * p->hidden_dim * p->dim); - w->rms_final_weight = new float[p->dim](); - printf("[%s:AK] Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); + w->rms_final_weight.resize(p->dim); + LOG("%s: Allocating [%d] float space for w->rms_final_weight\n",__func__,p->dim); - if (shared_weights) { - w->wcls = NULL; - } else { - w->wcls = new float[p->vocab_size * p->dim](); - printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + if (shared_weights) { + w->wcls = {}; + } else { + w->wcls.resize(p->vocab_size * p->dim); + LOG("%s: Allocating [%d] x [%d] = [%d] float space for w->wcls\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim); + } + } + catch (std::length_error &) { + die("Invalid configuration. Failed to allocate memory for weights"); } } -static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) { - if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; - if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wk, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wv, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->wo, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast(p->n_layers * p->dim * p->dim)) return 1; - if (fread(w->rms_ffn_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast(p->n_layers * p->dim)) return 1; - if (fread(w->w1, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->w2, sizeof(float), p->n_layers * p->hidden_dim * p->dim, f) != static_cast(p->n_layers * p->hidden_dim * p->dim)) return 1; - if (fread(w->w3, sizeof(float), p->n_layers * p->dim * p->hidden_dim, f) != static_cast(p->n_layers * p->dim * p->hidden_dim)) return 1; - if (fread(w->rms_final_weight, sizeof(float), p->dim, f) != static_cast(p->dim)) return 1; +static int checkpoint_init_weights(TransformerWeights * w, const Config * p, FILE * f, bool shared_weights) { + if (fread(w->token_embedding_table.data(), sizeof(float), w->token_embedding_table.size(), f) != w->token_embedding_table.size()) return 1; + if (fread(w->rms_att_weight.data(), sizeof(float), w->rms_att_weight.size(), f) != w->rms_att_weight.size()) return 1; + if (fread(w->wq.data(), sizeof(float), w->wq.size(), f) != w->wq.size()) return 1; + if (fread(w->wk.data(), sizeof(float), w->wk.size(), f) != w->wk.size()) return 1; + if (fread(w->wv.data(), sizeof(float), w->wv.size(), f) != w->wv.size()) return 1; + if (fread(w->wo.data(), sizeof(float), w->wo.size(), f) != w->wo.size()) return 1; + if (fread(w->rms_ffn_weight.data(), sizeof(float), w->rms_ffn_weight.size(), f) != w->rms_ffn_weight.size()) return 1; + if (fread(w->w1.data(), sizeof(float), w->w1.size(), f) != w->w1.size()) return 1; + if (fread(w->w2.data(), sizeof(float), w->w2.size(), f) != w->w2.size()) return 1; + if (fread(w->w3.data(), sizeof(float), w->w3.size(), f) != w->w3.size()) return 1; + if (fread(w->rms_final_weight.data(), sizeof(float), w->rms_final_weight.size(), f) != w->rms_final_weight.size()) return 1; // Skip freq_cis_real & freq_cis_imag int head_size = p->dim / p->n_heads; fseek(f, p->seq_len * head_size * sizeof(float), SEEK_CUR); - if (!shared_weights && fread(w->wcls, sizeof(float), p->vocab_size * p->dim, f) != static_cast(p->vocab_size * p->dim)) return 1; + if (!shared_weights && fread(w->wcls.data(), sizeof(float), w->wcls.size(), f) != w->wcls.size()) return 1; // Check we didn't forget to read anything auto curr = ftell(f); fseek(f, 0, SEEK_END); auto end = ftell(f); if (curr != end) { - printf("Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", curr, end); + LOG("%s: Error: failed to read the checkpoint file to the end (curr = %ld, end = %ld)\n", __func__, curr, end); return 1; } @@ -190,20 +181,20 @@ static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bo } static void print_sample_weights(TransformerWeights *w){ - printf("----- Quick print of first of the weight vales of all the variables\n"); - printf("%f\n", w->token_embedding_table[0]); - printf("%f\n", w->rms_att_weight[0]); - printf("%f\n", w->rms_ffn_weight[0]); + LOG("----- Quick print of first of the weight vales of all the variables\n"); + LOG("%f\n", w->token_embedding_table[0]); + LOG("%f\n", w->rms_att_weight[0]); + LOG("%f\n", w->rms_ffn_weight[0]); - printf("%f\n", w->wq[0]); - printf("%f\n", w->wk[0]); - printf("%f\n", w->wv[0]); - printf("%f\n", w->wo[0]); - printf("%f\n", w->w1[0]); - printf("%f\n", w->w2[0]); - printf("%f\n", w->w3[0]); - printf("%f\n", w->rms_att_weight[0]); - if (w->wcls) printf("%f\n", w->wcls[0]); + LOG("%f\n", w->wq[0]); + LOG("%f\n", w->wk[0]); + LOG("%f\n", w->wv[0]); + LOG("%f\n", w->wo[0]); + LOG("%f\n", w->w1[0]); + LOG("%f\n", w->w2[0]); + LOG("%f\n", w->w3[0]); + LOG("%f\n", w->rms_att_weight[0]); + if (!w->wcls.empty()) LOG("%f\n", w->wcls[0]); } //////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -225,14 +216,16 @@ struct llama_vocab { }; struct my_llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_ff = 11008; - uint32_t n_mult = 4; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? + uint32_t n_embd = 4096; + uint32_t n_ff = 11008; + uint32_t n_mult = 4; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + bool operator!=(const my_llama_hparams& other) const { return memcmp(this, &other, sizeof(my_llama_hparams)); } @@ -325,14 +318,30 @@ struct train_params { }; static void print_params(struct my_llama_hparams * params) { - printf("%s: n_vocab: %u\n", __func__, params->n_vocab); - printf("%s: n_ctx: %u\n", __func__, params->n_ctx); - printf("%s: n_embd: %u\n", __func__, params->n_embd); - printf("%s: n_mult: %u\n", __func__, params->n_mult); - printf("%s: n_head: %u\n", __func__, params->n_head); - printf("%s: n_ff: %u\n", __func__, params->n_ff); - printf("%s: n_layer: %u\n", __func__, params->n_layer); - printf("%s: n_rot: %u\n", __func__, params->n_rot); + LOG("%s: n_vocab: %u\n", __func__, params->n_vocab); + LOG("%s: n_ctx: %u\n", __func__, params->n_ctx); + LOG("%s: n_embd: %u\n", __func__, params->n_embd); + LOG("%s: n_mult: %u\n", __func__, params->n_mult); + LOG("%s: n_head: %u\n", __func__, params->n_head); + LOG("%s: n_head_kv: %u\n", __func__, params->n_head_kv); + LOG("%s: n_ff: %u\n", __func__, params->n_ff); + LOG("%s: n_layer: %u\n", __func__, params->n_layer); + LOG("%s: n_rot: %u\n", __func__, params->n_rot); +} + +static void print_tensor_info(const struct ggml_context * ctx) { + for (auto t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + LOG("%s: Allocating ", __func__); + int64_t total = 1; + int i = 0; + for (; i < ggml_n_dims(t); ++i) { + if (i > 0) LOG("x "); + LOG("[%" PRId64 "] ", t->ne[i]); + total *= t->ne[i]; + } + if (i > 1) LOG("= [%" PRId64 "] ", total); + LOG("float space for %s\n", ggml_get_name(t)); + } } static void init_model(struct my_llama_model * model) { @@ -342,6 +351,8 @@ static void init_model(struct my_llama_model * model) { const uint32_t n_layer = hparams.n_layer; const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_multiqueries = hparams.n_head_kv <= 0 || hparams.n_head_kv >= hparams.n_head ? 1 : hparams.n_head / hparams.n_head_kv; + const uint32_t n_ff = hparams.n_ff; struct ggml_context * ctx = model->ctx; @@ -350,25 +361,8 @@ static void init_model(struct my_llama_model * model) { model->train_tokens = 0; model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%u] x [%u] = [%u] float space for model->tok_embeddings\n",__func__,n_embd , n_vocab, n_embd * n_vocab); - model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - printf("[%s:GG] Allocating [%u] float space for model->norm\n",__func__,n_embd); - model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for model->output\n",__func__,n_embd, n_vocab, n_embd * n_vocab); - - // printing the per-layer allocations here so we dont print in the for loop. - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wq for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wk for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wv for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.wo for [%u] layers\n",__func__, n_embd, n_embd, n_embd * n_embd, n_layer); - - printf("[%s:GG] Allocating [%u] float space for layer.ffn_norm for [%u] layers\n",__func__,n_embd, n_layer); - - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w1 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w2 for [%u] layers\n",__func__, n_embd, n_ff, n_ff * n_embd, n_layer); - printf("[%s:GG] Allocating [%u] x[%u] = [%u] float space for layer.w3 for [%u] layers\n",__func__, n_ff, n_embd, n_embd * n_ff, n_layer); ggml_set_name(model->tok_embeddings, "tok_embeddings.weight"); ggml_set_name(model->norm, "norm.weight"); @@ -383,8 +377,8 @@ static void init_model(struct my_llama_model * model) { layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); - layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); + layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); + layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd / n_multiqueries); layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); @@ -406,6 +400,8 @@ static void init_model(struct my_llama_model * model) { ggml_format_name(layer.w2, "%s.feed_forward.w2.weight", layers_i.c_str()); ggml_format_name(layer.w3, "%s.feed_forward.w3.weight", layers_i.c_str()); } + + print_tensor_info(ctx); } static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { @@ -421,9 +417,9 @@ static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) { static void print_row(struct ggml_tensor * probs, int i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %f", p); + LOG(" %f", p); } - printf("\n"); + LOG("\n"); } static void print_matrix(struct ggml_tensor * probs) { @@ -431,33 +427,12 @@ static void print_matrix(struct ggml_tensor * probs) { for (int i = 0; i < probs->ne[1]; ++i) { for (int k = 0; k < probs->ne[0]; ++k) { float p = get_f32_2d(probs, k, i); - printf(" %.2f", p); + LOG(" %.2f", p); } - printf("\n"); + LOG("\n"); } } -#ifdef __GNUC__ -#ifdef __MINGW32__ -__attribute__((format(gnu_printf, 1, 2))) -#else -__attribute__((format(printf, 1, 2))) -#endif -#endif -static std::string format(const char * fmt, ...) { - va_list ap, ap2; - va_start(ap, fmt); - va_copy(ap2, ap); - int size = vsnprintf(NULL, 0, fmt, ap); - GGML_ASSERT(size >= 0 && size < INT_MAX); - std::vector buf(size + 1); - int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2); - GGML_ASSERT(size2 == size); - va_end(ap2); - va_end(ap); - return std::string(buf.data(), size); -} - struct llama_file { // use FILE * so we don't have to re-open the file to mmap FILE * fp; @@ -549,8 +524,9 @@ static std::string llama_escape_whitespaces(const std::string & text) { return out.str(); } -static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) { +static void load_vocab(const char * filename, const Config * config, struct llama_vocab * vocab) { if (is_ggml_file(filename)) { + LOG("%s: Loading vocabulary from gguf file %s\n", __func__, filename); struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -578,6 +554,9 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab const int * toktypes = (const int * ) gguf_get_arr_data(ctx, toktype_idx); const uint32_t n_vocab = gguf_get_arr_n(ctx, token_idx); + if (n_vocab != static_cast(config->vocab_size)) { + die_fmt("vocab size mismatch: (gguf) %u != (llama2c) %d", n_vocab, config->vocab_size); + } vocab->id_to_token.resize(n_vocab); @@ -595,7 +574,7 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab gguf_free(ctx); } else { // assume llama2.c vocabulary - printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename); + LOG("%s: Assuming llama2.c vocabulary since %s is not a gguf file\n", __func__, filename); llama_file file(filename, "rb"); if (!file.fp) { die_fmt("%s: %s", strerror(errno), filename); @@ -638,38 +617,15 @@ static void load_vocab(const char *filename, Config *config, struct llama_vocab } static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) { - int ct; - switch (ggml_n_dims(gg_weights)) { - case 1: - ct = 0; - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++){ - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0]); - *ptr = karpathy_weights[ct]; - ct++; - } - break; - case 2: - ct = 0; - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - break; - case 3: - ct = 0; - for (int i2 = 0; i2 < gg_weights->ne[2]; i2++) { - for (int i1 = 0; i1 < gg_weights->ne[1]; i1++) { - for (int i0 = 0; i0 < gg_weights->ne[0]; i0++) { - float * ptr = (float *) ((char *) gg_weights->data + i0*gg_weights->nb[0] + i1*gg_weights->nb[1] + i2*gg_weights->nb[2]); - *ptr = karpathy_weights[ct]; - ct++; - } - } - } - break; + int size = 1; + for (int dim = 0; dim < ggml_n_dims(gg_weights); ++dim) { + size *= gg_weights->ne[dim]; + } + for (int ct = 0; ct < size; ++ct) { + int64_t i0 = 0; int64_t i1 = 0; + int64_t i2 = 0; int64_t i3 = 0; + ggml_unravel_index(gg_weights, ct, &i0, &i1, &i2, &i3); + ggml_set_f32_nd(gg_weights, i0, i1, i2, i3, karpathy_weights[ct]); } } @@ -679,16 +635,18 @@ static void save_as_llama_model( // convert AK weights into GG weights one by one. // w->token_embedding_table -> model->tok_embeddings // float* -> struct ggml_tensor - convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table); - convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table); + convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table.data()); + convert_weights_ak_to_gg(model->output, !w->wcls.empty() ? w->wcls.data() : w->token_embedding_table.data()); - convert_weights_ak_to_gg(model->norm, w->rms_final_weight); + convert_weights_ak_to_gg(model->norm, w->rms_final_weight.data()); //print_row(model->norm, 0); // for rms-att-weight int row_length = model->hparams.n_embd; int n_ff = model->hparams.n_ff; + const uint32_t n_multiqueries = model->hparams.n_head_kv <= 0 || model->hparams.n_head_kv >= model->hparams.n_head ? 1 : model->hparams.n_head / model->hparams.n_head_kv; + for (uint32_t i = 0; i < model->hparams.n_layer; ++i){ auto & layer = model->layers[i]; // 1d @@ -697,9 +655,10 @@ static void save_as_llama_model( // from 3d matrix layer x dim x dim to 2d matrix dim x dim convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]); - convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]); convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]); + // from 3d matrix layer x dim x dim to 2d matrix dim x dim / n_multiqueries + convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length/n_multiqueries]); + convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length/n_multiqueries]); convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]); convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]); @@ -736,8 +695,8 @@ static void save_as_llama_model( gguf_set_val_u32(ctx, KV_EMBEDDING_LENGTH, model->hparams.n_embd); gguf_set_val_u32(ctx, KV_FEED_FORWARD_LENGTH, model->hparams.n_ff); gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); - // n_head_kv is optional, default to n_head - // gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, ...); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT, model->hparams.n_head); + gguf_set_val_u32(ctx, KV_ATTENTION_HEAD_COUNT_KV, model->hparams.n_head_kv); gguf_set_val_u32(ctx, KV_BLOCK_COUNT, model->hparams.n_layer); gguf_set_val_u32(ctx, KV_ROPE_DIMENSION_COUNT, model->hparams.n_rot); gguf_set_val_f32(ctx, KV_ATTENTION_LAYERNORM_RMS_EPS, 1e-5f); @@ -789,12 +748,12 @@ static void save_as_llama_model( static struct train_params get_default_train_params() { struct train_params params; - params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; + params.fn_vocab_model = "models/7B/ggml-model-f16.gguf"; params.fn_llama2c_output_model = "ak_llama_model.bin"; - params.fn_train_data = "shakespeare.txt"; - params.fn_checkpoint_in = "checkpoint.bin"; - params.fn_checkpoint_out = "checkpoint.bin"; - params.fn_model_out = "ggml-checkpoint-f32.bin"; + params.fn_train_data = "shakespeare.txt"; + params.fn_checkpoint_in = "checkpoint.bin"; + params.fn_checkpoint_out = "checkpoint.bin"; + params.fn_model_out = "ggml-checkpoint-f32.bin"; params.seed = -1; @@ -829,8 +788,8 @@ static struct train_params get_default_train_params() { params.adam_alpha = 1e-3f; params.adam_decay = 1e-3f; - params.mem_model_gb = 2; - params.mem_compute_gb = 24; + params.mem_model_gb = 2; + params.mem_compute_gb = 24; params.mem_compute0_gb = 8; params.mem_compute1_gb = 2; @@ -916,19 +875,30 @@ int main(int argc, char ** argv) { if (!params_parse(argc, argv, ¶ms)) { return 1; } + log_set_target(stdout); Config config; TransformerWeights weights = {}; { - FILE *file = fopen(params.fn_llama2c_model, "rb"); - if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; } + LOG("%s: Loading llama2c model from %s\n", __func__, params.fn_llama2c_model); + FILE * file = fopen(params.fn_llama2c_model, "rb"); + if (!file) { + LOG("%s: Unable to open the checkpoint file %s!\n", __func__, params.fn_llama2c_model); + return 1; + } // read in the config header - if(fread(&config, sizeof(Config), 1, file) != 1) { return 1; } + if (fread(&config, sizeof(Config), 1, file) != 1) { + LOG("%s: Unable to read llama2c config from %s!\n",__func__,params.fn_llama2c_model); + return 1; + } auto shared_weights = config.vocab_size > 0; config.vocab_size = abs(config.vocab_size); // read in the Transformer weights - malloc_weights(&weights, &config, shared_weights); - if(checkpoint_init_weights(&weights, &config, file, shared_weights)) { return 1; } + alloc_weights(&weights, &config, shared_weights); + if (checkpoint_init_weights(&weights, &config, file, shared_weights)) { + LOG("%s: Unable to initialize transformer weights from %s!",__func__,params.fn_llama2c_model); + return 1; + } fclose(file); } @@ -936,15 +906,18 @@ int main(int argc, char ** argv) { load_vocab(params.fn_vocab_model, &config, &vocab); struct my_llama_model model; - model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); - model.hparams.n_ctx = params.n_ctx; - model.hparams.n_embd = config.dim; //params.n_embd; - model.hparams.n_ff = config.hidden_dim; - model.hparams.n_mult = 32;//params.n_mult; - model.hparams.n_head = config.n_heads; //params.n_head; - model.hparams.n_layer = config.n_layers; //params.n_layer; - model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + model.hparams.n_vocab = config.vocab_size; //llama_n_vocab(lctx); + model.hparams.n_ctx = params.n_ctx; + model.hparams.n_embd = config.dim; //params.n_embd; + model.hparams.n_ff = config.hidden_dim; + model.hparams.n_mult = 32;//params.n_mult; + model.hparams.n_head = config.n_heads; //params.n_head; + model.hparams.n_head_kv = config.n_kv_heads; + model.hparams.n_layer = config.n_layers; //params.n_layer; + model.hparams.n_rot = std::min((uint32_t)params.n_rotmax, model.hparams.n_embd / model.hparams.n_head); + print_params(&model.hparams); + struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; @@ -956,7 +929,7 @@ int main(int argc, char ** argv) { model.name = basename(params.fn_llama2c_model); save_as_llama_model(&vocab, &model, &weights, params.fn_llama2c_output_model); - printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model); + LOG("%s: Saving llama.c model file %s in ggml format at %s\n", __func__, params.fn_llama2c_model, params.fn_llama2c_output_model); ggml_free(model.ctx); return 0; diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index cbf9aa2b5..6a93147d7 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -61,6 +61,8 @@ int main(int argc, char ** argv) { } params.embedding = true; + // For non-causal models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; print_build_info(); @@ -114,15 +116,17 @@ int main(int argc, char ** argv) { for (const auto & prompt : prompts) { auto inp = ::llama_tokenize(ctx, prompt, true, false); if (inp.size() > n_batch) { - inp.resize(n_batch); + fprintf(stderr, "%s: error: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int) inp.size(), (long long int) n_batch); + return 1; } inputs.push_back(inp); } - // add eos if not present + // add SEP if not present for (auto & inp : inputs) { - if (inp.empty() || inp.back() != llama_token_eos(model)) { - inp.push_back(llama_token_eos(model)); + if (inp.empty() || inp.back() != llama_token_sep(model)) { + inp.push_back(llama_token_sep(model)); } } @@ -174,25 +178,27 @@ int main(int argc, char ** argv) { float * out = emb + p * n_embd; batch_decode(ctx, batch, out, s, n_embd); - // print the first part of the embeddings + // print the first part of the embeddings or for a single prompt, the full embedding fprintf(stdout, "\n"); for (int j = 0; j < n_prompts; j++) { fprintf(stdout, "embedding %d: ", j); - for (int i = 0; i < std::min(16, n_embd); i++) { + for (int i = 0; i < (n_prompts > 1 ? std::min(16, n_embd) : n_embd); i++) { fprintf(stdout, "%9.6f ", emb[j * n_embd + i]); } fprintf(stdout, "\n"); } // print cosine similarity matrix - fprintf(stdout, "\n"); - printf("cosine similarity matrix:\n\n"); - for (int i = 0; i < n_prompts; i++) { - for (int j = 0; j < n_prompts; j++) { - float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); - fprintf(stdout, "%6.2f ", sim); - } + if (n_prompts > 1) { fprintf(stdout, "\n"); + printf("cosine similarity matrix:\n\n"); + for (int i = 0; i < n_prompts; i++) { + for (int j = 0; j < n_prompts; j++) { + float sim = llama_embd_similarity_cos(emb + i * n_embd, emb + j * n_embd, n_embd); + fprintf(stdout, "%6.2f ", sim); + } + fprintf(stdout, "\n"); + } } // clean up diff --git a/examples/eval-callback/CMakeLists.txt b/examples/eval-callback/CMakeLists.txt new file mode 100644 index 000000000..c56ba780b --- /dev/null +++ b/examples/eval-callback/CMakeLists.txt @@ -0,0 +1,9 @@ +set(TARGET eval-callback) +add_executable(${TARGET} eval-callback.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TEST_TARGET test-eval-callback) +add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42 -ngl 0) +set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl) diff --git a/examples/eval-callback/README.md b/examples/eval-callback/README.md new file mode 100644 index 000000000..66a37e878 --- /dev/null +++ b/examples/eval-callback/README.md @@ -0,0 +1,95 @@ +# llama.cpp/examples/eval-callback + +A simple example which demonstrates how to use callback during the inference. +It simply prints to the console all operations and tensor data. + +Usage: + +```shell +eval-callback \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model phi-2-q4_0.gguf \ + --prompt hello \ + --seed 42 \ + -ngl 33 +``` + +Will print: + +```shell +llm_load_tensors: offloaded 33/33 layers to GPU +... +llama_new_context_with_model: n_ctx = 512 +... +llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB +llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB +llama_new_context_with_model: graph nodes = 1225 +llama_new_context_with_model: graph splits = 2 +ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.0181, 0.0272, 0.0272, ...], + ], + ] +ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -0.6989, 1.0636, 1.0636, ...], + ], + ] +ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1800, 0.2817, 0.2632, ...], + ], + ] +ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1} + [ + [ + [ -0.1863, 0.2970, 0.2604, ...], + ], + ] +ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1238, 1.2876, -1.8086, ...], + ], + ] +ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + ], + ] +ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1} + [ + [ + [ -1.1135, 1.4604, -1.9226, ...], + [ -0.3608, 0.5076, -1.8866, ...], + [ 1.7643, 0.0273, -2.1065, ...], + ... + ], + ] +``` diff --git a/examples/eval-callback/eval-callback.cpp b/examples/eval-callback/eval-callback.cpp new file mode 100644 index 000000000..e670d3769 --- /dev/null +++ b/examples/eval-callback/eval-callback.cpp @@ -0,0 +1,195 @@ +#include "common.h" +#include "llama.h" +#include "ggml.h" + +#include +#include +#include +#include +#include + +/** + * This the arbitrary data which will be passed to each callback. + * Later on we can for example add operation or tensor name filter from the CLI arg, or a file descriptor to dump the tensor. + */ +struct callback_data { + std::vector data; +}; + +static std::string ggml_ne_string(const ggml_tensor * t) { + std::string str; + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + str += std::to_string(t->ne[i]); + if (i + 1 < GGML_MAX_DIMS) { + str += ", "; + } + } + return str; +} + +static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne, const size_t * nb, int64_t n) { + GGML_ASSERT(n > 0); + float sum = 0; + for (int64_t i3 = 0; i3 < ne[3]; i3++) { + printf(" [\n"); + for (int64_t i2 = 0; i2 < ne[2]; i2++) { + if (i2 == n && ne[2] > 2*n) { + printf(" ..., \n"); + i2 = ne[2] - n; + } + printf(" [\n"); + for (int64_t i1 = 0; i1 < ne[1]; i1++) { + if (i1 == n && ne[1] > 2*n) { + printf(" ..., \n"); + i1 = ne[1] - n; + } + printf(" ["); + for (int64_t i0 = 0; i0 < ne[0]; i0++) { + if (i0 == n && ne[0] > 2*n) { + printf("..., "); + i0 = ne[0] - n; + } + size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0]; + float v; + if (type == GGML_TYPE_F16) { + v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]); + } else if (type == GGML_TYPE_F32) { + v = *(float *) &data[i]; + } else if (type == GGML_TYPE_I32) { + v = (float) *(int32_t *) &data[i]; + } else if (type == GGML_TYPE_I16) { + v = (float) *(int16_t *) &data[i]; + } else if (type == GGML_TYPE_I8) { + v = (float) *(int8_t *) &data[i]; + } else { + GGML_ASSERT(false); + } + printf("%12.4f", v); + sum += v; + if (i0 < ne[0] - 1) printf(", "); + } + printf("],\n"); + } + printf(" ],\n"); + } + printf(" ]\n"); + printf(" sum = %f\n", sum); + } +} + +/** + * GGML operations callback during the graph execution. + * + * @param t current tensor + * @param ask when ask is true, the scheduler wants to know if we are interested in data from this tensor + * if we return true, a follow-up call will be made with ask=false in which we can do the actual collection. + * see ggml_backend_sched_eval_callback + * @param user_data user data to pass at each call back + * @return true to receive data or continue the graph, false otherwise + */ +static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) { + auto * cb_data = (callback_data *) user_data; + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + + char src1_str[128] = {0}; + if (src1) { + sprintf(src1_str, "%s{%s}", src1->name, ggml_ne_string(src1).c_str()); + } + + printf("%s: %24s = (%s) %10s(%s{%s}, %s}) = {%s}\n", __func__, + t->name, ggml_type_name(t->type), ggml_op_desc(t), + src0->name, ggml_ne_string(src0).c_str(), + src1 ? src1_str : "", + ggml_ne_string(t).c_str()); + + + // copy the data from the GPU memory if needed + const bool is_host = ggml_backend_buffer_is_host(t->buffer); + + if (!is_host) { + auto n_bytes = ggml_nbytes(t); + cb_data->data.resize(n_bytes); + ggml_backend_tensor_get(t, cb_data->data.data(), 0, n_bytes); + } + + if (!ggml_is_quantized(t->type)) { + uint8_t * data = is_host ? (uint8_t *) t->data : cb_data->data.data(); + ggml_print_tensor(data, t->type, t->ne, t->nb, 3); + } + + return true; +} + +static bool run(llama_context * ctx, const gpt_params & params) { + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size(), 0, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + return true; +} + +int main(int argc, char ** argv) { + + callback_data cb_data; + + gpt_params params; + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + print_build_info(); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(); + llama_numa_init(params.numa); + + // pass the callback to the backend scheduler + // it will be executed for each node during the graph computation + params.cb_eval = ggml_debug; + params.cb_eval_user_data = &cb_data; + params.warmup = false; + + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); + return 1; + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = run(ctx, params); + if (!OK) { + return 1; + } + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/finetune/finetune.cpp b/examples/finetune/finetune.cpp index 3da5317b3..22743b1bf 100644 --- a/examples/finetune/finetune.cpp +++ b/examples/finetune/finetune.cpp @@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs( GGML_ASSERT(tokens_input->type == GGML_TYPE_I32); auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) { + if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) { return ggml_add_cast(ctx, a, b, GGML_TYPE_F32); } else if (a->type == GGML_TYPE_F32) { return ggml_add(ctx, a, b); diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt new file mode 100644 index 000000000..166e3ad2a --- /dev/null +++ b/examples/gbnf-validator/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gbnf-validator) +add_executable(${TARGET} gbnf-validator.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common grammar-parser llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/examples/gbnf-validator/gbnf-validator.cpp new file mode 100644 index 000000000..091069ffa --- /dev/null +++ b/examples/gbnf-validator/gbnf-validator.cpp @@ -0,0 +1,132 @@ +#define LLAMA_API_INTERNAL + +#include "grammar-parser.h" +#include "ggml.h" +#include "llama.h" +#include "unicode.h" + +#include +#include +#include +#include + +static bool llama_sample_grammar_string(struct llama_grammar * grammar, const std::string & input_str, size_t & error_pos, std::string & error_msg) { + auto decoded = decode_utf8(input_str, {}); + const auto & code_points = decoded.first; + + size_t pos = 0; + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + auto prev_stacks = grammar->stacks; + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); + if (grammar->stacks.empty()) { + error_pos = pos; + error_msg = "Unexpected character '" + unicode_cpt_to_utf8(*it) + "'"; + grammar->stacks = prev_stacks; + return false; + } + ++pos; + } + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + return true; + } + } + + error_pos = pos; + error_msg = "Unexpected end of input"; + return false; +} + +static void print_error_message(const std::string & input_str, size_t error_pos, const std::string & error_msg) { + fprintf(stdout, "Input string is invalid according to the grammar.\n"); + fprintf(stdout, "Error: %s at position %zu\n", error_msg.c_str(), error_pos); + fprintf(stdout, "\n"); + fprintf(stdout, "Input string:\n"); + fprintf(stdout, "%s", input_str.substr(0, error_pos).c_str()); + if (error_pos < input_str.size()) { + fprintf(stdout, "\033[1;31m%c", input_str[error_pos]); + if (error_pos+1 < input_str.size()) { + fprintf(stdout, "\033[0;31m%s", input_str.substr(error_pos+1).c_str()); + } + fprintf(stdout, "\033[0m\n"); + } +} + +int main(int argc, char** argv) { + if (argc != 3) { + fprintf(stdout, "Usage: %s \n", argv[0]); + return 1; + } + + const std::string grammar_filename = argv[1]; + const std::string input_filename = argv[2]; + + // Read the GBNF grammar file + FILE* grammar_file = fopen(grammar_filename.c_str(), "r"); + if (!grammar_file) { + fprintf(stdout, "Failed to open grammar file: %s\n", grammar_filename.c_str()); + return 1; + } + + fseek(grammar_file, 0, SEEK_END); + size_t grammar_size = ftell(grammar_file); + fseek(grammar_file, 0, SEEK_SET); + + std::string grammar_str(grammar_size, ' '); + fread(&grammar_str[0], 1, grammar_size, grammar_file); + fclose(grammar_file); + + // Parse the GBNF grammar + auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // will be empty (default) if there are parse errors + if (parsed_grammar.rules.empty()) { + fprintf(stdout, "%s: failed to parse grammar\n", __func__); + return 1; + } + + // Ensure that there is a "root" node. + if (parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()) { + fprintf(stdout, "%s: grammar does not contain a 'root' symbol\n", __func__); + return 1; + } + + std::vector grammar_rules(parsed_grammar.c_rules()); + + // Create the LLAMA grammar + auto grammar = llama_grammar_init( + grammar_rules.data(), + grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + // Read the input file + FILE* input_file = fopen(input_filename.c_str(), "r"); + if (!input_file) { + fprintf(stdout, "Failed to open input file: %s\n", input_filename.c_str()); + return 1; + } + + fseek(input_file, 0, SEEK_END); + size_t input_size = ftell(input_file); + fseek(input_file, 0, SEEK_SET); + + std::string input_str(input_size, ' '); + fread(&input_str[0], 1, input_size, input_file); + fclose(input_file); + + // Validate the input string against the grammar + size_t error_pos; + std::string error_msg; + bool is_valid = llama_sample_grammar_string(grammar, input_str, error_pos, error_msg); + + if (is_valid) { + fprintf(stdout, "Input string is valid according to the grammar.\n"); + } else { + print_error_message(input_str, error_pos, error_msg); + } + + // Clean up + llama_grammar_free(grammar); + + return 0; +} diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md index ddb1f7649..ad1d86651 100644 --- a/examples/gguf-split/README.md +++ b/examples/gguf-split/README.md @@ -5,5 +5,6 @@ CLI to split / merge GGUF files. **Command line options:** - `--split`: split GGUF to multiple GGUF, default operation. +- `--split-max-size`: max size per split in `M` or `G`, f.ex. `500M` or `2G`. - `--split-max-tensors`: maximum tensors in each split: default(128) - `--merge`: merge multiple GGUF to a single GGUF. diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 8e12e6493..e04feeae3 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -1,37 +1,39 @@ #include "llama.h" -#include "ggml.h" #include "common.h" #include #include -#include #include #include -#include #include #include #include -#include #include +#include +#include + +#if defined(_WIN32) + #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif + #include +#endif enum split_operation : uint8_t { SPLIT_OP_SPLIT, SPLIT_OP_MERGE, }; -static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; -static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; - -static const int SPLIT_FILENAME_MAX = 256; - -static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; - struct split_params { split_operation operation = SPLIT_OP_SPLIT; + size_t n_bytes_split = 0; int n_split_tensors = 128; std::string input; std::string output; + bool no_tensor_first_split = false; + bool dry_run = false; }; static void split_print_usage(const char * executable) { @@ -42,15 +44,37 @@ static void split_print_usage(const char * executable) { printf("Apply a GGUF operation on IN to OUT."); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); - printf(" --split split GGUF to multiple GGUF (default)\n"); - printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors); - printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (enabled by default)\n"); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors); + printf(" --split-max-size N(M|G) max size per split\n"); + printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n"); + printf(" --dry-run only print out a split plan and exit, without writing any new files\n"); printf("\n"); } -static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) { +// return convert string, for example "128M" or "4G" to number of bytes +static size_t split_str_to_n_bytes(std::string str) { + size_t n_bytes = 0; + int n; + if (str.back() == 'M') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1024 * 1024; // megabytes + } else if (str.back() == 'G') { + sscanf(str.c_str(), "%d", &n); + n_bytes = (size_t)n * 1024 * 1024 * 1024; // gigabytes + } else { + throw std::invalid_argument("error: supported units are M (megabytes) or G (gigabytes), but got: " + std::string(1, str.back())); + } + if (n <= 0) { + throw std::invalid_argument("error: size must be a positive value"); + } + return n_bytes; +} + +static void split_params_parse_ex(int argc, const char ** argv, split_params & params) { std::string arg; const std::string arg_prefix = "--"; bool invalid_param = false; @@ -63,6 +87,8 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p } bool arg_found = false; + bool is_op_set = false; + bool is_mode_set = false; if (arg == "-h" || arg == "--help") { split_print_usage(argv[0]); exit(0); @@ -72,23 +98,50 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); exit(0); } + if (arg == "--dry-run") { + arg_found = true; + params.dry_run = true; + } + if (arg == "--no-tensor-first-split") { + arg_found = true; + params.no_tensor_first_split = true; + } + if (is_op_set) { + throw std::invalid_argument("error: either --split or --merge can be specified, but not both"); + } if (arg == "--merge") { arg_found = true; + is_op_set = true; params.operation = SPLIT_OP_MERGE; } if (arg == "--split") { arg_found = true; + is_op_set = true; params.operation = SPLIT_OP_SPLIT; } + + if (is_mode_set) { + throw std::invalid_argument("error: either --split-max-tensors or --split-max-size can be specified, but not both"); + } if (arg == "--split-max-tensors") { if (++arg_idx >= argc) { invalid_param = true; break; } arg_found = true; + is_mode_set = true; params.n_split_tensors = atoi(argv[arg_idx]); } + if (arg == "--split-max-size") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + is_mode_set = true; + params.n_bytes_split = split_str_to_n_bytes(argv[arg_idx]); + } if (!arg_found) { throw std::invalid_argument("error: unknown argument: " + arg); @@ -100,29 +153,22 @@ static bool split_params_parse_ex(int argc, const char ** argv, split_params & p } if (argc - arg_idx < 2) { - printf("%s: bad arguments\n", argv[0]); - split_print_usage(argv[0]); - return false; + throw std::invalid_argument("error: bad arguments"); } params.input = argv[arg_idx++]; params.output = argv[arg_idx++]; - - return true; } static bool split_params_parse(int argc, const char ** argv, split_params & params) { bool result = true; try { - if (!split_params_parse_ex(argc, argv, params)) { - split_print_usage(argv[0]); - exit(1); - } + split_params_parse_ex(argc, argv, params); } catch (const std::invalid_argument & ex) { fprintf(stderr, "%s\n", ex.what()); split_print_usage(argv[0]); - exit(1); + exit(EXIT_FAILURE); } return result; } @@ -134,12 +180,6 @@ static void zeros(std::ofstream & file, size_t n) { } } -static std::string split_file_name(const std::string & path, int i_split, int n_split) { - char f_split[SPLIT_FILENAME_MAX] = {0}; - snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); - return std::string(f_split); -} - struct split_strategy { const split_params params; std::ifstream & f_input; @@ -147,15 +187,11 @@ struct split_strategy { struct ggml_context * ctx_meta = NULL; const int n_tensors; - const int n_split; - int i_split = 0; + // one ctx_out per one output file + std::vector ctx_outs; - int i_tensor = 0; - - std::vector read_data; - - struct gguf_context * ctx_out; - std::ofstream fout; + // temporary buffer for reading in tensor data + std::vector read_buf; split_strategy(const split_params & params, std::ifstream & f_input, @@ -165,77 +201,146 @@ struct split_strategy { f_input(f_input), ctx_gguf(ctx_gguf), ctx_meta(ctx_meta), - n_tensors(gguf_get_n_tensors(ctx_gguf)), - n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) { + n_tensors(gguf_get_n_tensors(ctx_gguf)) { + + // because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits + int i_split = -1; + struct gguf_context * ctx_out = NULL; + auto new_ctx_out = [&](bool allow_no_tensors) { + i_split++; + if (ctx_out != NULL) { + if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) { + fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n"); + exit(EXIT_FAILURE); + } + ctx_outs.push_back(ctx_out); + } + ctx_out = gguf_init_empty(); + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_NO, i_split); + gguf_set_val_u16(ctx_out, LLM_KV_SPLIT_COUNT, 0); // placeholder + gguf_set_val_i32(ctx_out, LLM_KV_SPLIT_TENSORS_COUNT, n_tensors); + }; + + // initialize ctx_out for the first split + new_ctx_out(false); + + // skip first split if no_tensor_first_split is set + if (params.no_tensor_first_split) { + new_ctx_out(true); } - bool should_split() const { - return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + // process tensors one by one + size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata) + for (int i = 0; i < n_tensors; ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + // calculate the "imaginary" size = the current size + next tensor size + size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT); + size_t next_tensors_size = curr_tensors_size + n_bytes; + if (should_split(i, next_tensors_size)) { + new_ctx_out(false); + curr_tensors_size = n_bytes; + } else { + curr_tensors_size = next_tensors_size; + } + gguf_add_tensor(ctx_out, t); + } + + // push the last ctx_out + ctx_outs.push_back(ctx_out); + + // set the correct n_split for all ctx_out + for (auto & ctx : ctx_outs) { + gguf_set_val_u16(ctx, LLM_KV_SPLIT_COUNT, ctx_outs.size()); + } } - void split_start() { - ctx_out = gguf_init_empty(); - - // Save all metadata in first split only - if (i_split == 0) { - gguf_set_kv(ctx_out, ctx_gguf); + ~split_strategy() { + for (auto & ctx_out : ctx_outs) { + gguf_free(ctx_out); } - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); - - // populate the original tensors, so we get an initial metadata - for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - gguf_add_tensor(ctx_out, meta); - } - - auto split_name = split_file_name(params.output, i_split, n_split); - - fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); - fout = std::ofstream(split_name, std::ios::binary); - fout.exceptions(std::ofstream::failbit); // fail fast on write errors - - auto meta_size = gguf_get_meta_size(ctx_out); - - // placeholder for the meta data - ::zeros(fout, meta_size); - - i_split++; } - void next_tensor() { - const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); - auto n_bytes = ggml_nbytes(t); - - if (read_data.size() < n_bytes) { - read_data.resize(n_bytes); + bool should_split(int i_tensor, size_t next_size) { + if (params.n_bytes_split > 0) { + // split by max size per file + return next_size > params.n_bytes_split; + } else { + // split by number of tensors per file + return i_tensor > 0 && i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; } - - auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); - f_input.seekg(offset); - f_input.read((char *)read_data.data(), n_bytes); - - t->data = read_data.data(); - - // write tensor data + padding - fout.write((const char *)t->data, n_bytes); - zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); - - i_tensor++; } - void split_end() { - // go back to beginning of file and write the updated metadata - fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_out)); - gguf_get_meta_data(ctx_out, data.data()); - fout.write((const char *)data.data(), data.size()); + void print_info() { + printf("n_split: %ld\n", ctx_outs.size()); + int i_split = 0; + for (auto & ctx_out : ctx_outs) { + // re-calculate the real gguf size for each split (= metadata size + total size of all tensors) + size_t total_size = gguf_get_meta_size(ctx_out); + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_out, i)); + total_size += ggml_nbytes(t); + } + total_size = total_size / 1024 / 1024; // convert to megabytes + printf("split %05d: n_tensors = %d, total_size = %ldM\n", i_split + 1, gguf_get_n_tensors(ctx_out), total_size); + i_split++; + } + } - fout.close(); - gguf_free(ctx_out); + void write() { + int i_split = 0; + int n_split = ctx_outs.size(); + for (auto & ctx_out : ctx_outs) { + // construct file path + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), params.output.c_str(), i_split, n_split); - fprintf(stderr, "\033[3Ddone\n"); + // open the output file + printf("Writing file %s ... ", split_path); + fflush(stdout); + std::ofstream fout = std::ofstream(split_path, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + // write metadata + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + // write tensors + for (int i = 0; i < gguf_get_n_tensors(ctx_out); ++i) { + // read tensor meta and prepare buffer + const char * t_name = gguf_get_tensor_name(ctx_out, i); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + read_buf.resize(n_bytes); + + // calculate offset + auto i_tensor_in = gguf_find_tensor(ctx_gguf, t_name); // idx of tensor in the input file + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor_in); + + // copy tensor from input to output file + copy_file_to_file(f_input, fout, offset, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + printf("done\n"); + // close the file + fout.close(); + i_split++; + } + } + + void copy_file_to_file(std::ifstream & f_in, std::ofstream & f_out, const size_t in_offset, const size_t len) { + // TODO: detect OS and use copy_file_range() here for better performance + if (read_buf.size() < len) { + read_buf.resize(len); + } + f_in.seekg(in_offset); + f_in.read((char *)read_buf.data(), len); + f_out.write((const char *)read_buf.data(), len); } }; @@ -250,37 +355,31 @@ static void gguf_split(const split_params & split_params) { std::ifstream f_input(split_params.input.c_str(), std::ios::binary); if (!f_input.is_open()) { fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); if (!ctx_gguf) { fprintf(stderr, "%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } + // prepare the strategy split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); - fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", - __func__, split_params.input.c_str(), - split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), - split_params.n_split_tensors); + int n_split = strategy.ctx_outs.size(); + strategy.print_info(); - strategy.split_start(); - - while (strategy.i_tensor < strategy.n_tensors) { - strategy.next_tensor(); - if (strategy.should_split()) { - strategy.split_end(); - strategy.split_start(); - } + if (!split_params.dry_run) { + // write all output splits + strategy.write(); } - strategy.split_end(); + // done, clean up gguf_free(ctx_gguf); f_input.close(); fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", - __func__, strategy.n_split, strategy.n_tensors); + __func__, n_split, strategy.n_tensors); } static void gguf_merge(const split_params & split_params) { @@ -298,7 +397,9 @@ static void gguf_merge(const split_params & split_params) { std::vector ctx_metas; std::vector ctx_ggufs; - std::string split_prefix; + char split_path[PATH_MAX] = {0}; + strncpy(split_path, split_params.input.c_str(), sizeof(split_path) - 1); + char split_prefix[PATH_MAX] = {0}; // First pass to find KV and tensors metadata for (int i_split = 0; i_split < n_split; i_split++) { @@ -309,89 +410,66 @@ static void gguf_merge(const split_params & split_params) { /*.ctx = */ &ctx_meta, }; - auto split_name = split_params.input; if (i_split > 0) { - split_name = split_file_name(split_prefix, i_split, n_split); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); } - fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_path); - auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + auto * ctx_gguf = gguf_init_from_file(split_path, params); if (!ctx_gguf) { fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); - exit(1); + exit(EXIT_FAILURE); } ctx_ggufs.push_back(ctx_gguf); ctx_metas.push_back(ctx_meta); if (i_split == 0) { - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); if (key_n_split < 0) { fprintf(stderr, "\n%s: input file does not contain %s metadata\n", __func__, - LLM_KV_GENERAL_SPLIT_N_SPLIT); + LLM_KV_SPLIT_COUNT); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); if (n_split < 1) { fprintf(stderr, "\n%s: input file does not contain a valid split count %d\n", __func__, n_split); gguf_free(ctx_gguf); + ggml_free(ctx_meta); gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); + } + + // Verify the file naming and extract split_prefix + if (!llama_split_prefix(split_prefix, sizeof (split_prefix), split_path, i_split, n_split)) { + fprintf(stderr, "\n%s: unexpected input file name: %s" + " i_split=%d" + " n_split=%d\n", __func__, + split_path, i_split, n_split); + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + gguf_free(ctx_out); + fout.close(); + exit(EXIT_FAILURE); } // Do not trigger merge if we try to merge again the output - gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + gguf_set_val_u16(ctx_gguf, LLM_KV_SPLIT_COUNT, 0); // Set metadata from the first split gguf_set_kv(ctx_out, ctx_gguf); } - // Verify the file naming - { - int i_split_file = 0; - int n_split_file = 0; - const char * i_split_format = "-00000-of-00000.gguf"; - - if (split_name.size() < strlen(i_split_format)) { - fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } - - split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); - - const char * split_name_c_str = split_name.c_str(); - int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); - - if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { - fprintf(stderr, "\n%s: unexpected input file name: %s" - " i_split=%d i_split_file=%d" - " n_split=%d n_split_file=%d\n", __func__, - split_params.input.c_str(), - i_split, i_split_file, - n_split, n_split_file); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); - } - gguf_free(ctx_out); - fout.close(); - exit(1); - } - } - auto n_tensors = gguf_get_n_tensors(ctx_gguf); for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); @@ -411,18 +489,19 @@ static void gguf_merge(const split_params & split_params) { // Write tensors data for (int i_split = 0; i_split < n_split; i_split++) { - auto split_name = split_file_name(split_prefix, i_split, n_split); - std::ifstream f_input(split_name.c_str(), std::ios::binary); + llama_split_path(split_path, sizeof(split_path), split_prefix, i_split, n_split); + std::ifstream f_input(split_path, std::ios::binary); if (!f_input.is_open()) { - fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); - for (auto * _ctx_gguf : ctx_ggufs) { - gguf_free(_ctx_gguf); + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_path); + for (uint32_t i = 0; i < ctx_ggufs.size(); i++) { + gguf_free(ctx_ggufs[i]); + ggml_free(ctx_metas[i]); } gguf_free(ctx_out); fout.close(); - exit(1); + exit(EXIT_FAILURE); } - fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_path); auto * ctx_gguf = ctx_ggufs[i_split]; auto * ctx_meta = ctx_metas[i_split]; @@ -469,10 +548,6 @@ static void gguf_merge(const split_params & split_params) { } int main(int argc, const char ** argv) { - if (argc < 3) { - split_print_usage(argv[0]); - } - split_params params; split_params_parse(argc, argv, params); @@ -481,8 +556,8 @@ int main(int argc, const char ** argv) { break; case SPLIT_OP_MERGE: gguf_merge(params); break; - default:split_print_usage(argv[0]); - exit(1); + default: split_print_usage(argv[0]); + exit(EXIT_FAILURE); } return 0; diff --git a/examples/gguf-split/tests.sh b/examples/gguf-split/tests.sh new file mode 100755 index 000000000..7ca6fa7f2 --- /dev/null +++ b/examples/gguf-split/tests.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +set -eu + +if [ $# -lt 1 ] +then + echo "usage: $0 path_to_build_binary [path_to_temp_folder]" + echo "example: $0 ../../build/bin ../../tmp" + exit 1 +fi + +if [ $# -gt 1 ] +then + TMP_DIR=$2 +else + TMP_DIR=/tmp +fi + +set -x + +SPLIT=$1/gguf-split +MAIN=$1/main +WORK_PATH=$TMP_DIR/gguf-split +ROOT_DIR=$(realpath $(dirname $0)/../../) + +mkdir -p "$WORK_PATH" + +# Clean up in case of previously failed test +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf + +# 1. Get a model +( +cd $WORK_PATH +"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf +) +echo PASS + +# 2. Split with max tensors strategy +$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split +echo PASS +echo + +# 2b. Test the sharded model is loading properly +$MAIN --model $WORK_PATH/ggml-model-split-00001-of-00006.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 3. Merge +$SPLIT --merge $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-merge.gguf +echo PASS +echo + +# 3b. Test the merged model is loading properly +$MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 4. Split with no tensors in the first split +$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors +echo PASS +echo + +# 4b. Test the sharded model is loading properly +$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 5. Merge +#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf +#echo PASS +#echo + +# 5b. Test the merged model is loading properly +#$MAIN --model $WORK_PATH/ggml-model-merge-2.gguf --random-prompt --n-predict 32 +#echo PASS +#echo + +# 6. Split with size strategy +$SPLIT --split-max-size 2G $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-2G +echo PASS +echo + +# 6b. Test the sharded model is loading properly +$MAIN --model $WORK_PATH/ggml-model-split-2G-00001-of-00002.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# Clean up +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-merge*.gguf diff --git a/examples/gguf/gguf.cpp b/examples/gguf/gguf.cpp index 5444503a5..575143771 100644 --- a/examples/gguf/gguf.cpp +++ b/examples/gguf/gguf.cpp @@ -142,7 +142,7 @@ static bool gguf_ex_read_0(const std::string & fname) { } // read and create ggml_context containing the tensors and their data -static bool gguf_ex_read_1(const std::string & fname) { +static bool gguf_ex_read_1(const std::string & fname, bool check_data) { struct ggml_context * ctx_data = NULL; struct gguf_init_params params = { @@ -206,7 +206,7 @@ static bool gguf_ex_read_1(const std::string & fname) { printf("\n\n"); // check data - { + if (check_data) { const float * data = (const float *) cur->data; for (int j = 0; j < ggml_nelements(cur); ++j) { if (data[j] != 100 + i) { @@ -229,9 +229,16 @@ static bool gguf_ex_read_1(const std::string & fname) { int main(int argc, char ** argv) { if (argc < 3) { - printf("usage: %s data.gguf r|w\n", argv[0]); + printf("usage: %s data.gguf r|w [n]\n", argv[0]); + printf("r: read data.gguf file\n"); + printf("w: write data.gguf file\n"); + printf("n: no check of tensor data\n"); return -1; } + bool check_data = true; + if (argc == 4) { + check_data = false; + } const std::string fname(argv[1]); const std::string mode (argv[2]); @@ -242,7 +249,7 @@ int main(int argc, char ** argv) { GGML_ASSERT(gguf_ex_write(fname) && "failed to write gguf file"); } else if (mode == "r") { GGML_ASSERT(gguf_ex_read_0(fname) && "failed to read gguf file"); - GGML_ASSERT(gguf_ex_read_1(fname) && "failed to read gguf file"); + GGML_ASSERT(gguf_ex_read_1(fname, check_data) && "failed to read gguf file"); } return 0; diff --git a/examples/gritlm/README.md b/examples/gritlm/README.md index 64cc19204..a3a3c1389 100644 --- a/examples/gritlm/README.md +++ b/examples/gritlm/README.md @@ -21,12 +21,12 @@ not have to be performed at all. ### Running the example Download a Grit model: ```console -$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf +$ scripts/hf.sh --repo cohesionet/GritLM-7B_gguf --file gritlm-7b_q4_1.gguf --outdir models ``` Run the example using the downloaded model: ```console -$ ./gritlm -m gritlm-7b_q4_1.gguf +$ ./gritlm -m models/gritlm-7b_q4_1.gguf Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "A purely peer-to-peer version of electronic cash w" is: 0.605 Cosine similarity between "Bitcoin: A Peer-to-Peer Electronic Cash System" and "All text-based language problems can be reduced to" is: 0.103 diff --git a/examples/imatrix/README.md b/examples/imatrix/README.md index 578e8fc27..458c01b87 100644 --- a/examples/imatrix/README.md +++ b/examples/imatrix/README.md @@ -22,7 +22,7 @@ For faster computation, make sure to use GPU offloading via the `-ngl` argument ## Example ```bash -LLAMA_CUBLAS=1 make -j +LLAMA_CUDA=1 make -j # generate importance matrix (imatrix.dat) ./imatrix -m ggml-model-f16.gguf -f train-data.txt -ngl 99 diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index ea79b9062..82b19fc4f 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -19,10 +19,12 @@ struct Stats { std::vector values; + std::vector counts; int ncall = 0; }; struct StatParams { + std::string dataset; std::string ofile = "imatrix.dat"; int n_output_frequency = 10; int verbosity = 1; @@ -44,41 +46,44 @@ private: std::mutex m_mutex; int m_last_call = 0; std::vector m_src1_data; - std::vector m_ids; // the expert ids from ggml_mul_mat_id + std::vector m_ids; // the expert ids from ggml_mul_mat_id // - void save_imatrix(const char * file_name) const; + void save_imatrix(const char * file_name, const char * dataset) const; void keep_imatrix(int ncall) const; }; +// remove any prefix and suffixes from the name +// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight +static std::string filter_tensor_name(const char * name) { + std::string wname; + const char * p = strchr(name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = name; + } + return wname; +} + bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) { GGML_UNUSED(user_data); const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; - - std::string wname; - { - // remove any prefix and suffixes from the name - // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight - const char * p = strchr(src0->name, '#'); - if (p != NULL) { - p = p + 1; - const char * q = strchr(p, '#'); - if (q != NULL) { - wname = std::string(p, q - p); - } else { - wname = p; - } - } else { - wname = src0->name; - } - } + std::string wname = filter_tensor_name(src0->name); // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; + // why are small batches ignored (<16 tokens)? if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; if (!(wname.substr(0, 4) == "blk." || (m_params.collect_output_weight && wname == "output.weight"))) return false; return true; @@ -96,44 +101,59 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * data = is_host ? (const float *) src1->data : m_src1_data.data(); + // this has been adapted to the new format of storing merged experts in a single 3d tensor + // ref: https://github.com/ggerganov/llama.cpp/pull/6387 if (t->op == GGML_OP_MUL_MAT_ID) { - const int idx = ((int32_t *) t->op_params)[0]; - const int n_as = ((int32_t *) t->op_params)[1]; + // ids -> [n_experts_used, n_tokens] + // src1 -> [cols, n_expert_used, n_tokens] + const ggml_tensor * ids = t->src[2]; + const int n_as = src0->ne[2]; + const int n_ids = ids->ne[0]; - // the top-k selected expert ids are stored in the src0 tensor - // for simplicity, always copy src0 to host, because it is small - // take into account that src0 is not contiguous! - GGML_ASSERT(src0->ne[1] == src1->ne[1]); - GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int))); - m_ids.resize(ggml_nbytes(src0)/sizeof(int)); - ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0)); + // the top-k selected expert ids are stored in the ids tensor + // for simplicity, always copy ids to host, because it is small + // take into account that ids is not contiguous! + GGML_ASSERT(ids->ne[1] == src1->ne[2]); + + m_ids.resize(ggml_nbytes(ids)); + ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids)); + + auto & e = m_stats[wname]; + + ++e.ncall; + + if (e.values.empty()) { + e.values.resize(src1->ne[0]*n_as, 0); + e.counts.resize(src1->ne[0]*n_as, 0); + } + else if (e.values.size() != (size_t)src1->ne[0]*n_as) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as); + exit(1); //GGML_ASSERT(false); + } + if (m_params.verbosity > 1) { + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type); + } // loop over all possible experts, regardless if they are used or not in the batch - // this is necessary to guarantee equal number of "ncall" for each tensor for (int ex = 0; ex < n_as; ++ex) { - src0 = t->src[2 + ex]; - auto& e = m_stats[wname]; - if (e.values.empty()) { - e.values.resize(src1->ne[0], 0); - } - else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); - exit(1); //GGML_ASSERT(false); - } - // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger - // using the following line, we can correct for that if needed - //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; - ++e.ncall; - if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); - } - for (int row = 0; row < (int)src1->ne[1]; ++row) { - const int excur = m_ids[row*n_as + idx]; - GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check - if (excur != ex) continue; - const float * x = data + row * src1->ne[0]; - for (int j = 0; j < (int)src1->ne[0]; ++j) { - e.values[j] += x[j]*x[j]; + size_t e_start = ex*src1->ne[0]; + + for (int idx = 0; idx < n_ids; ++idx) { + for (int row = 0; row < (int)src1->ne[2]; ++row) { + const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]); + + GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check + + if (excur != ex) continue; + + const int64_t i11 = idx % src1->ne[1]; + const int64_t i12 = row; + const float * x = (const float *)((const char *)data + i11*src1->nb[1] + i12*src1->nb[2]); + + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[e_start + j] += x[j]*x[j]; + e.counts[e_start + j]++; + } } } if (e.ncall > m_last_call) { @@ -150,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); + e.counts.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); @@ -163,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const float * x = data + row * src1->ne[0]; for (int j = 0; j < (int)src1->ne[0]; ++j) { e.values[j] += x[j]*x[j]; + e.counts[j]++; } } if (e.ncall > m_last_call) { @@ -180,7 +202,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } void IMatrixCollector::save_imatrix() const { - save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str()); + save_imatrix(m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(), m_params.dataset.c_str()); } void IMatrixCollector::keep_imatrix(int ncall) const { @@ -188,24 +210,39 @@ void IMatrixCollector::keep_imatrix(int ncall) const { if (file_name.empty()) file_name = "imatrix.dat"; file_name += ".at_"; file_name += std::to_string(ncall); - save_imatrix(file_name.c_str()); + save_imatrix(file_name.c_str(), m_params.dataset.c_str()); } -void IMatrixCollector::save_imatrix(const char * fname) const { +void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) const { std::ofstream out(fname, std::ios::binary); int n_entries = m_stats.size(); - out.write((const char*)&n_entries, sizeof(n_entries)); - for (auto& p : m_stats) { + out.write((const char *) &n_entries, sizeof(n_entries)); + for (const auto & p : m_stats) { int len = p.first.size(); - out.write((const char*)&len, sizeof(len)); + out.write((const char *) &len, sizeof(len)); out.write(p.first.c_str(), len); - out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + out.write((const char *) &p.second.ncall, sizeof(p.second.ncall)); int nval = p.second.values.size(); - out.write((const char*)&nval, sizeof(nval)); - if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + out.write((const char *) &nval, sizeof(nval)); + if (nval > 0) { + std::vector tmp(nval); + for (int i = 0; i < nval; i++) { + tmp[i] = (p.second.values[i] / static_cast(p.second.counts[i])) * static_cast(p.second.ncall); + } + out.write((const char*)tmp.data(), nval*sizeof(float)); + } } + + // Write the number of call the matrix was computed with + out.write((const char *) &m_last_call, sizeof(m_last_call)); + + // Write the dataset name at the end of the file to later on specify it in quantize + int n_dataset = strlen(dataset); + out.write((const char *) &n_dataset, sizeof(n_dataset)); + out.write(dataset, n_dataset); + if (m_params.verbosity > 0) { - fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n", __func__, m_last_call, fname); } } @@ -241,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma imatrix_data = {}; return false; } - e.values.resize(nval); - in.read((char*)e.values.data(), nval*sizeof(float)); + + // When re-called from load_imatrix() with add set, this will already be created. + if (e.values.empty()) { + e.values.resize(nval, 0); + e.counts.resize(nval, 0); + } + + std::vector tmp(nval); + in.read((char*)tmp.data(), nval*sizeof(float)); if (in.fail()) { printf("%s: failed reading data for entry %d\n",__func__,i); imatrix_data = {}; return false; } - e.ncall = ncall; + + // Recreate the state as expected by save_imatrix(), and corerct for weighted sum. + for (int i = 0; i < nval; i++) { + e.values[i] += tmp[i]; + e.counts[i] += ncall; + } + e.ncall += ncall; + } return true; } @@ -343,12 +394,13 @@ static void process_logits( static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool compute_ppl, int from_chunk) { const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); const int n_ctx = llama_n_ctx(ctx); auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -421,6 +473,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } + // TODO: use batch.logits to save computations instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return false; @@ -526,6 +579,29 @@ int main(int argc, char ** argv) { } } + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + sparams.dataset = params.prompt_file; g_collector.set_parameters(std::move(sparams)); if (!combine_files.empty()) { @@ -564,49 +640,21 @@ int main(int argc, char ** argv) { } } - gpt_params params; - params.n_batch = 512; - if (!gpt_params_parse(args.size(), args.data(), params)) { - return 1; - } - - params.logits_all = true; - params.n_batch = std::min(params.n_batch, params.n_ctx); - - print_build_info(); - - if (params.seed == LLAMA_DEFAULT_SEED) { - params.seed = time(NULL); - } - - fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.random_prompt) { - params.prompt = gpt_random_prompt(rng); - } - llama_backend_init(); llama_numa_init(params.numa); - llama_model_params mparams = llama_model_params_from_gpt_params(params); - - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); - if (model == NULL) { - fprintf(stderr, "%s: error: unable to load model\n", __func__); - return 1; - } - - llama_context_params cparams = llama_context_params_from_gpt_params(params); - // pass the callback to the backend scheduler // it will be executed for each node during the graph computation - cparams.cb_eval = ik_collect_imatrix; - cparams.cb_eval_user_data = NULL; + params.cb_eval = ik_collect_imatrix; + params.cb_eval_user_data = NULL; + params.warmup = false; - llama_context * ctx = llama_new_context_with_model(model, cparams); - if (ctx == NULL) { - fprintf(stderr, "%s: error: unable to create context\n", __func__); + // init + llama_model * model; + llama_context * ctx; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == nullptr || ctx == nullptr) { + fprintf(stderr, "%s : failed to init\n", __func__); return 1; } diff --git a/examples/infill/README.md b/examples/infill/README.md index 8c97f719b..6b076c839 100644 --- a/examples/infill/README.md +++ b/examples/infill/README.md @@ -36,6 +36,11 @@ The `infill` program offers a seamless way to interact with LLaMA models, allowi ### Example +Download a model that supports infill, for example CodeLlama: +```console +scripts/hf.sh --repo TheBloke/CodeLlama-13B-GGUF --file codellama-13b.Q5_K_S.gguf --outdir models +``` + ```bash ./infill -t 10 -ngl 0 -m models/codellama-13b.Q5_K_S.gguf -c 4096 --temp 0.7 --repeat_penalty 1.1 -n 20 --in-prefix "def helloworld():\n print(\"hell" --in-suffix "\n print(\"goodbye world\")\n " ``` diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 91c39c5ae..afac145f6 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -239,6 +239,7 @@ int main(int argc, char ** argv) { LOG_TEE("%s\n", get_system_info(params).c_str()); } const bool add_bos = llama_should_add_bos_token(model); + GGML_ASSERT(llama_add_eos_token(model) != 1); LOG("add_bos: %d\n", add_bos); bool suff_rm_leading_spc = params.escape; @@ -279,10 +280,10 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true); LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); @@ -585,7 +586,7 @@ int main(int argc, char ** argv) { // deal with eot token in infill mode if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ - if(is_interacting && !params.interactive_first) { + if (is_interacting && !params.interactive_first) { // print an eot token printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } @@ -650,8 +651,8 @@ int main(int argc, char ** argv) { // LOG_TEE("took new input\n"); is_interacting = false; } - // deal with end of text token in interactive mode - else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { + // deal with end of generation tokens in interactive mode + else if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { LOG("found EOS token\n"); if (params.interactive) { @@ -730,8 +731,8 @@ int main(int argc, char ** argv) { } } - // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { + // end of generation + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !params.interactive) { break; } diff --git a/examples/json-schema-to-grammar.py b/examples/json_schema_to_grammar.py similarity index 73% rename from examples/json-schema-to-grammar.py rename to examples/json_schema_to_grammar.py index 9eead557f..826cd3f72 100755 --- a/examples/json-schema-to-grammar.py +++ b/examples/json_schema_to_grammar.py @@ -6,37 +6,94 @@ import re import sys from typing import Any, Dict, List, Set, Tuple, Union +def _build_repetition(item_rule, min_items, max_items, separator_rule=None, item_rule_is_literal=False): + if not separator_rule: + if min_items == 0 and max_items == 1: + return f'{item_rule}?' + elif min_items == 1 and max_items is None: + return f'{item_rule}+' + + result = '' + + if min_items > 0: + if item_rule_is_literal and separator_rule is None: + result = '"' + (item_rule[1:-1] * min_items) + '"' + else: + result = (f' {separator_rule} ' if separator_rule else ' ').join([item_rule] * min_items) + + def opt_repetitions(up_to_n, prefix_with_sep=False): + ''' + - n=4, no sep: '(a (a (a (a)?)?)?)?' + - n=4, sep=',', prefix: '("," a ("," a ("," a ("," a)?)?)?)?' + - n=4, sep=',', no prefix: '(a ("," a ("," a ("," a)?)?)?)?' + ''' + + content = f'{separator_rule} {item_rule}' if prefix_with_sep and separator_rule else item_rule + if up_to_n == 0: + return '' + elif up_to_n == 1: + return f'({content})?' + elif separator_rule and not prefix_with_sep: + return f'({content} {opt_repetitions(up_to_n - 1, prefix_with_sep=True)})?' + else: + return (f'({content} ' * up_to_n).rstrip() + (')?' * up_to_n) + + if min_items > 0 and max_items != min_items: + result += ' ' + + if max_items is not None: + result += opt_repetitions(max_items - min_items, prefix_with_sep=min_items > 0) + else: + item_operator = f'({separator_rule + " " if separator_rule else ""}{item_rule})' + + if min_items == 0 and separator_rule: + result = f'({item_rule} {item_operator}*)?' + else: + result += f'{item_operator}*' + + return result + + +class BuiltinRule: + def __init__(self, content: str, deps: list = None): + self.content = content + self.deps = deps or [] + +_up_to_15_digits = _build_repetition('[0-9]', 0, 15) + # whitespace is constrained to a single space char to prevent model "running away" in # whitespace. Also maybe improves generation quality? SPACE_RULE = '" "?' PRIMITIVE_RULES = { - 'boolean': '("true" | "false") space', - 'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space', - 'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space', - 'value' : 'object | array | string | number | boolean', - 'object' : '"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', - 'array' : '"[" space ( value ("," space value)* )? "]" space', - 'uuid' : '"\\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + ' "\\"" space', - 'string': r''' "\"" ( - [^"\\] | - "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) - )* "\"" space''', - 'null': '"null" space', + 'boolean' : BuiltinRule('("true" | "false") space', []), + 'decimal-part' : BuiltinRule('[0-9] ' + _up_to_15_digits, []), + 'integral-part': BuiltinRule('[0-9] | [1-9] ' + _up_to_15_digits, []), + 'number' : BuiltinRule('("-"? integral-part) ("." decimal-part)? ([eE] [-+]? integral-part)? space', ['integral-part', 'decimal-part']), + 'integer' : BuiltinRule('("-"? integral-part) space', ['integral-part']), + 'value' : BuiltinRule('object | array | string | number | boolean | null', ['object', 'array', 'string', 'number', 'boolean', 'null']), + 'object' : BuiltinRule('"{" space ( string ":" space value ("," space string ":" space value)* )? "}" space', ['string', 'value']), + 'array' : BuiltinRule('"[" space ( value ("," space value)* )? "]" space', ['value']), + 'uuid' : BuiltinRule(r'"\"" ' + ' "-" '.join('[0-9a-fA-F]' * n for n in [8, 4, 4, 4, 12]) + r' "\"" space', []), + 'char' : BuiltinRule(r'[^"\\] | "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])', []), + 'string' : BuiltinRule(r'"\"" char* "\"" space', ['char']), + 'null' : BuiltinRule('"null" space', []), } -OBJECT_RULE_NAMES = ['object', 'array', 'string', 'number', 'boolean', 'null', 'value'] # TODO: support "uri", "email" string formats -DATE_RULES = { - 'date' : '[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', - 'time' : '([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', - 'date-time': 'date "T" time', - 'date-string': '"\\"" date "\\"" space', - 'time-string': '"\\"" time "\\"" space', - 'date-time-string': '"\\"" date-time "\\"" space', +STRING_FORMAT_RULES = { + 'date' : BuiltinRule('[0-9] [0-9] [0-9] [0-9] "-" ( "0" [1-9] | "1" [0-2] ) "-" ( \"0\" [1-9] | [1-2] [0-9] | "3" [0-1] )', []), + 'time' : BuiltinRule('([01] [0-9] | "2" [0-3]) ":" [0-5] [0-9] ":" [0-5] [0-9] ( "." [0-9] [0-9] [0-9] )? ( "Z" | ( "+" | "-" ) ( [01] [0-9] | "2" [0-3] ) ":" [0-5] [0-9] )', []), + 'date-time' : BuiltinRule('date "T" time', ['date', 'time']), + 'date-string' : BuiltinRule('"\\"" date "\\"" space', ['date']), + 'time-string' : BuiltinRule('"\\"" time "\\"" space', ['time']), + 'date-time-string': BuiltinRule('"\\"" date-time "\\"" space', ['date-time']), } -RESERVED_NAMES = set(["root", *PRIMITIVE_RULES.keys(), *DATE_RULES.keys()]) +DOTALL = '[\\U00000000-\\U0010FFFF]' +DOT = '[^\\x0A\\x0D]' + +RESERVED_NAMES = set(["root", "dot", *PRIMITIVE_RULES.keys(), *STRING_FORMAT_RULES.keys()]) INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+') GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"]') @@ -46,8 +103,6 @@ GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '-': '\\-', ']' NON_LITERAL_SET = set('|.()[]{}*+?') ESCAPED_IN_REGEXPS_BUT_NOT_IN_LITERALS = set('[]()|{}*+?') -DATE_PATTERN = '[0-9]{4}-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])' -TIME_PATTERN = '([01][0-9]|2[0-3])(:[0-5][0-9]){2}(\\.[0-9]{1,3})?(Z|[+-](([01][0-9]|2[0-3]):[0-5][0-9]))' # Cap millisecond precision w/ 3 digits class SchemaConverter: def __init__(self, *, prop_order, allow_fetch, dotall, raw_pattern): @@ -55,16 +110,41 @@ class SchemaConverter: self._allow_fetch = allow_fetch self._dotall = dotall self._raw_pattern = raw_pattern - self._rules = {'space': SPACE_RULE} + self._rules = { + 'space': SPACE_RULE, + } self._refs = {} self._refs_being_resolved = set() def _format_literal(self, literal): escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub( - lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), json.dumps(literal) + lambda m: GRAMMAR_LITERAL_ESCAPES.get(m.group(0)), literal ) return f'"{escaped}"' + def not_literal(self, literal: str, dotall: bool = True, maybe_escaped_underscores = False) -> str: + ''' + not_literal('a') -> '[^a]' + not_literal('abc') -> '([^a] | "a" ([^b] | "b" ([^c])?)?)?' + ''' + assert len(literal) > 0, 'Empty literal not supported' + def recurse(i: int): + c = literal[i] + if maybe_escaped_underscores and c == '_': + yield f'[^{c}\\\\]' + yield ' | ' + yield f'"\\\\"? "{c}"' + else: + yield f'[^{c}]' + if i < len(literal) - 1: + yield ' | ' + yield self._format_literal(c) + yield ' (' + yield from recurse(i + 1) + yield ')?' + + return ''.join(('(', *recurse(0), ')')) + def _add_rule(self, name, rule): esc_name = INVALID_RULE_CHARS_RE.sub('-', name) if esc_name not in self._rules or self._rules[esc_name] == rule: @@ -169,10 +249,10 @@ class SchemaConverter: def get_dot(): if self._dotall: - rule = '[\\U00000000-\\U0010FFFF]' + rule = DOTALL else: # Accept any character... except \n and \r line break chars (\x0A and \xOD) - rule = '[\\U00000000-\\x09\\x0B\\x0C\\x0E-\\U0010FFFF]' + rule = DOT return self._add_rule(f'dot', rule) def join_seq(): @@ -246,26 +326,14 @@ class SchemaConverter: (sub, sub_is_literal) = seq[-1] - if min_times == 0 and max_times is None: - seq[-1] = (f'{sub}*', False) - elif min_times == 0 and max_times == 1: - seq[-1] = (f'{sub}?', False) - elif min_times == 1 and max_times is None: - seq[-1] = (f'{sub}+', False) - else: - if not sub_is_literal: - id = sub_rule_ids.get(sub) - if id is None: - id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) - sub_rule_ids[sub] = id - sub = id + if not sub_is_literal: + id = sub_rule_ids.get(sub) + if id is None: + id = self._add_rule(f'{name}-{len(sub_rule_ids) + 1}', sub) + sub_rule_ids[sub] = id + sub = id - seq[-1] = ( - ' '.join( - ([f'"{sub[1:-1] * min_times}"'] if sub_is_literal else [sub] * min_times) + - ([f'{sub}?'] * (max_times - min_times) if max_times is not None else [f'{sub}*'])), - False - ) + seq[-1] = (_build_repetition(f'"{sub}"' if sub_is_literal else sub, min_times, max_times, item_rule_is_literal=sub_is_literal), False) else: literal = '' while i < length: @@ -308,8 +376,7 @@ class SchemaConverter: return ref_name def _generate_constant_rule(self, value): - assert isinstance(value, str), f'Only string constants are supported, got {value}' - return self._format_literal(value) + return self._format_literal(json.dumps(value)) def visit(self, schema, name): schema_type = schema.get('type') @@ -374,49 +441,47 @@ class SchemaConverter: ' "]" space') else: item_rule_name = self.visit(items, f'{name}{"-" if name else ""}item') - list_item_operator = f'( "," space {item_rule_name} )' - successive_items = "" min_items = schema.get("minItems", 0) max_items = schema.get("maxItems") - if min_items > 0: - successive_items = list_item_operator * (min_items - 1) - min_items -= 1 - if max_items is not None and max_items > min_items: - successive_items += (list_item_operator + "?") * (max_items - min_items - 1) - else: - successive_items += list_item_operator + "*" - if min_items == 0: - rule = f'"[" space ( {item_rule_name} {successive_items} )? "]" space' - else: - rule = f'"[" space {item_rule_name} {successive_items} "]" space' - return self._add_rule(rule_name, rule) + return self._add_rule(rule_name, '"[" space ' + _build_repetition(item_rule_name, min_items, max_items, separator_rule='"," space') + ' "]" space') elif schema_type in (None, 'string') and 'pattern' in schema: return self._visit_pattern(schema['pattern'], rule_name) elif schema_type in (None, 'string') and re.match(r'^uuid[1-5]?$', schema_format or ''): - return self._add_rule( + return self._add_primitive( 'root' if rule_name == 'root' else schema_format, PRIMITIVE_RULES['uuid'] ) - elif schema_type in (None, 'string') and schema_format in DATE_RULES: - for t, r in DATE_RULES.items(): - self._add_rule(t, r) - return schema_format + '-string' + elif schema_type in (None, 'string') and f'{schema_format}-string' in STRING_FORMAT_RULES: + prim_name = f'{schema_format}-string' + return self._add_rule(rule_name, self._add_primitive(prim_name, STRING_FORMAT_RULES[prim_name])) + + elif schema_type == 'string' and ('minLength' in schema or 'maxLength' in schema): + char_rule = self._add_primitive('char', PRIMITIVE_RULES['char']) + min_len = schema.get('minLength', 0) + max_len = schema.get('maxLength') + + return self._add_rule(rule_name, r'"\"" ' + _build_repetition(char_rule, min_len, max_len) + r' "\"" space') elif (schema_type == 'object') or (len(schema) == 0): - for n in OBJECT_RULE_NAMES: - self._add_rule(n, PRIMITIVE_RULES[n]) - return self._add_rule(rule_name, 'object') + return self._add_rule(rule_name, self._add_primitive('object', PRIMITIVE_RULES['object'])) else: assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}' # TODO: support minimum, maximum, exclusiveMinimum, exclusiveMaximum at least for zero - return self._add_rule( - 'root' if rule_name == 'root' else schema_type, - PRIMITIVE_RULES[schema_type] - ) + return self._add_primitive('root' if rule_name == 'root' else schema_type, PRIMITIVE_RULES[schema_type]) + + def _add_primitive(self, name: str, rule: BuiltinRule): + n = self._add_rule(name, rule.content) + + for dep in rule.deps: + dep_rule = PRIMITIVE_RULES.get(dep) or STRING_FORMAT_RULES.get(dep) + assert dep_rule, f'Rule {dep} not known' + if dep not in self._rules: + self._add_primitive(dep, dep_rule) + return n def _build_object_rule(self, properties: List[Tuple[str, Any]], required: Set[str], name: str, additional_properties: Union[bool, Any]): prop_order = self._prop_order @@ -428,7 +493,7 @@ class SchemaConverter: prop_rule_name = self.visit(prop_schema, f'{name}{"-" if name else ""}{prop_name}') prop_kv_rule_names[prop_name] = self._add_rule( f'{name}{"-" if name else ""}{prop_name}-kv', - fr'{self._format_literal(prop_name)} space ":" space {prop_rule_name}' + fr'{self._format_literal(json.dumps(prop_name))} space ":" space {prop_rule_name}' ) required_props = [k for k in sorted_props if k in required] optional_props = [k for k in sorted_props if k not in required] @@ -438,7 +503,7 @@ class SchemaConverter: value_rule = self.visit({} if additional_properties == True else additional_properties, f'{sub_name}-value') prop_kv_rule_names["*"] = self._add_rule( f'{sub_name}-kv', - self._add_rule('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' + self._add_primitive('string', PRIMITIVE_RULES['string']) + f' ":" space {value_rule}' ) optional_props.append("*") diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 82413b79d..40128ec44 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -113,7 +113,7 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; @@ -174,9 +174,11 @@ struct cmd_params { std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; + std::vector flash_attn; std::vector> tensor_split; std::vector use_mmap; std::vector embeddings; + ggml_numa_strategy numa; int reps; bool verbose; output_formats output_format; @@ -190,14 +192,16 @@ static const cmd_params cmd_params_defaults = { /* n_ubatch */ {512}, /* type_k */ {GGML_TYPE_F16}, /* type_v */ {GGML_TYPE_F16}, - /* n_threads */ {get_num_physical_cores()}, + /* n_threads */ {get_math_cpu_count()}, /* n_gpu_layers */ {99}, /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, + /* flash_attn */ {false}, /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, /* use_mmap */ {true}, /* embeddings */ {false}, + /* numa */ GGML_NUMA_STRATEGY_DISABLED, /* reps */ 5, /* verbose */ false, /* output_format */ MARKDOWN @@ -220,7 +224,9 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); + printf(" --numa (default: disabled)\n"); printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, --tensor-split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); @@ -393,6 +399,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); + } else if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + break; + } else { + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; break; } + } + } else if (arg == "-fa" || arg == "--flash-attn") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; @@ -477,6 +501,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } + if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; } if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; } @@ -498,6 +523,7 @@ struct cmd_params_instance { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; + bool flash_attn; std::vector tensor_split; bool use_mmap; bool embeddings; @@ -532,6 +558,7 @@ struct cmd_params_instance { cparams.type_k = type_k; cparams.type_v = type_v; cparams.offload_kqv = !no_kv_offload; + cparams.flash_attn = flash_attn; cparams.embeddings = embeddings; return cparams; @@ -554,6 +581,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) for (const auto & nkvo : params.no_kv_offload) + for (const auto & fa : params.flash_attn) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -572,6 +600,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, /* .tensor_split = */ ts, /* .use_mmap = */ mmp, /* .embeddings = */ embd, @@ -596,6 +625,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, + /* .flash_attn = */ fa, /* .tensor_split = */ ts, /* .use_mmap = */ mmp, /* .embeddings = */ embd, @@ -633,6 +663,7 @@ struct test { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; + bool flash_attn; std::vector tensor_split; bool use_mmap; bool embeddings; @@ -657,6 +688,7 @@ struct test { split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; + flash_attn = inst.flash_attn; tensor_split = inst.tensor_split; use_mmap = inst.use_mmap; embeddings = inst.embeddings; @@ -731,7 +763,7 @@ struct test { "n_batch", "n_ubatch", "n_threads", "type_k", "type_v", "n_gpu_layers", "split_mode", - "main_gpu", "no_kv_offload", + "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -753,7 +785,7 @@ struct test { } if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || - field == "use_mmap" || field == "embeddings") { + field == "flash_attn" || field == "use_mmap" || field == "embeddings") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -787,7 +819,7 @@ struct test { std::to_string(n_batch), std::to_string(n_ubatch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), - std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn), tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -808,7 +840,7 @@ struct test { const std::string test::build_commit = LLAMA_COMMIT; const int test::build_number = LLAMA_BUILD_NUMBER; -const bool test::cuda = !!ggml_cpu_has_cublas(); +const bool test::cuda = !!ggml_cpu_has_cuda(); const bool test::opencl = !!ggml_cpu_has_clblast(); const bool test::vulkan = !!ggml_cpu_has_vulkan(); const bool test::kompute = !!ggml_cpu_has_kompute(); @@ -955,6 +987,9 @@ struct markdown_printer : public printer { if (field == "no_kv_offload") { return "nkvo"; } + if (field == "flash_attn") { + return "fa"; + } if (field == "use_mmap") { return "mmap"; } @@ -1001,6 +1036,9 @@ struct markdown_printer : public printer { if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { fields.emplace_back("no_kv_offload"); } + if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) { + fields.emplace_back("flash_attn"); + } if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { fields.emplace_back("tensor_split"); } @@ -1191,6 +1229,7 @@ int main(int argc, char ** argv) { llama_log_set(llama_null_log_callback, NULL); } llama_backend_init(); + llama_numa_init(params.numa); // initialize printer std::unique_ptr p; diff --git a/examples/llama.android/app/src/main/cpp/llama-android.cpp b/examples/llama.android/app/src/main/cpp/llama-android.cpp index ce8ab3b70..4af9de303 100644 --- a/examples/llama.android/app/src/main/cpp/llama-android.cpp +++ b/examples/llama.android/app/src/main/cpp/llama-android.cpp @@ -408,7 +408,7 @@ Java_com_example_llama_Llm_completion_1loop( const auto new_token_id = llama_sample_token_greedy(context, &candidates_p); const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value); - if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { return env->NewStringUTF(""); } diff --git a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift index c249291ae..737f882fb 100644 --- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift +++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift @@ -158,7 +158,7 @@ actor LlamaContext { new_token_id = llama_sample_token_greedy(context, &candidates_p) } - if new_token_id == llama_token_eos(model) || n_cur == n_len { + if llama_token_is_eog(model, new_token_id) || n_cur == n_len { print("\n") let new_token_str = String(cString: temporary_invalid_cchars + [0]) temporary_invalid_cchars.removeAll() @@ -322,7 +322,7 @@ actor LlamaContext { defer { result.deallocate() } - let nTokens = llama_token_to_piece(model, token, result, 8) + let nTokens = llama_token_to_piece(model, token, result, 8, false) if nTokens < 0 { let newResult = UnsafeMutablePointer.allocate(capacity: Int(-nTokens)) @@ -330,7 +330,7 @@ actor LlamaContext { defer { newResult.deallocate() } - let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens) + let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false) let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens)) return Array(bufferPointer) } else { diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 4d5fef020..413e433dd 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava. -Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown. +Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. ## Usage Build with cmake or run `make llava-cli` to build it. @@ -22,7 +22,7 @@ After building, run: `./llava-cli` to see the usage. For example: ## Model conversion -- Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally: +1. Clone `mobileVLM-1.7B` and `clip-vit-large-patch14-336` locally: ```sh git clone https://huggingface.co/mtgv/MobileVLM-1.7B @@ -36,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B ``` -3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: +3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** please use `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh python ./examples/llava/convert-image-encoder-to-gguf \ @@ -78,7 +78,7 @@ cd examples/llava/android/build_64 ### run on Android refer to `android/adb_run.sh`, modify resources' `name` and `path` -## some result on Android with `Snapdragon 888` chip +## Some result on Android with `Snapdragon 888` chip ### case 1 **input** ```sh @@ -109,7 +109,6 @@ llama_print_timings: total time = 34731.93 ms --image /data/local/tmp/cat.jpeg \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" ``` - **output** ```sh encode_image_with_clip: image encoded in 21149.51 ms by CLIP ( 146.87 ms per image patch) @@ -121,12 +120,82 @@ llama_print_timings: eval time = 1279.03 ms / 18 runs ( 71.06 m llama_print_timings: total time = 34570.79 ms ``` + +## Some result on Android with `Snapdragon 778G` chip +### MobileVLM-1.7B case +#### llava-cli release-b2005 +**input** +```sh +/data/local/tmp/llava-cli \ + -m /data/local/tmp/ggml-model-q4_k.gguf \ + --mmproj /data/local/tmp/mmproj-model-f16.gguf \ + -t 4 \ + --image /data/local/tmp/many_llamas.jpeg \ + -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" +``` +**output** +```sh +encode_image_with_clip: image encoded in 18728.52 ms by CLIP ( 130.06 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that? ASSISTANT: + + A group of llamas are standing in a green pasture. + +llama_print_timings: load time = 20357.33 ms +llama_print_timings: sample time = 2.96 ms / 14 runs ( 0.21 ms per token, 4734.53 tokens per second) +llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 ms per token, 23.52 tokens per second) +llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second) +llama_print_timings: total time = 28038.34 ms / 205 tokens +``` +#### llava-cli latest-version +**input** + +Just the same as above. + +**output**(seems to be much slower) +```sh +encode_image_with_clip: image embedding created: 144 tokens + +encode_image_with_clip: image encoded in 288268.88 ms by CLIP ( 2001.87 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that? ASSISTANT: + + It is a group of sheep standing together in a grass field. + +llama_print_timings: load time = 818120.91 ms +llama_print_timings: sample time = 3.44 ms / 14 runs ( 0.25 ms per token, 4067.40 tokens per second) +llama_print_timings: prompt eval time = 529274.69 ms / 191 tokens ( 2771.07 ms per token, 0.36 tokens per second) +llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 ms per token, 0.30 tokens per second) +llama_print_timings: total time = 865441.76 ms / 204 tokens +``` +### MobileVLM_V2-1.7B case +#### llava-cli release-2005b +**input** + +Just the same as above. + +**output** +```sh +encode_image_with_clip: image encoded in 20609.61 ms by CLIP ( 143.12 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that? ASSISTANT: + + This image captures a lively scene of 20 llamas in motion on an expansive, grassy field. The llama is scattered across the landscape with some standing and others sitting down as if taking rest or observing their surroundings from different vantage points within this verdant setting. + +The background offers glimpses into a picturesque town nestled amidst hills under an overcast sky, adding depth to the scene while also emphasizing that distance between these llama and human-made structures like houses or roads in which they roam freely without any barriers around them. The image is framed by text at both right angles on white backgrounds against a contrasting blue backdrop with green foliage, further drawing attention to the llamas amidst their natural habitat while also inviting viewers into this picturesque landscape within town limits of Alta Llama + +llama_print_timings: load time = 22406.77 ms +llama_print_timings: sample time = 49.26 ms / 186 runs ( 0.26 ms per token, 3776.27 tokens per second) +llama_print_timings: prompt eval time = 9044.54 ms / 191 tokens ( 47.35 ms per token, 21.12 tokens per second) +llama_print_timings: eval time = 14497.49 ms / 186 runs ( 77.94 ms per token, 12.83 tokens per second) +llama_print_timings: total time = 44411.01 ms / 377 tokens +``` + ## Orin compile and run ### compile ```sh -make LLAMA_CUBLAS=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 +make LLAMA_CUDA=1 CUDA_DOCKER_ARCH=sm_87 LLAMA_CUDA_F16=1 -j 32 ``` - ### run on Orin ### case 1 **input** @@ -175,8 +244,121 @@ llama_print_timings: eval time = 166.65 ms / 11 runs ( 15.15 m llama_print_timings: total time = 1365.47 ms / 243 tokens ``` -## Minor shortcomings -The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quickly, we uniformly modified `clip_n_patches` function to a quarter. when counting the time consumption, the calculated time will be 4 times bigger than the real cost. +## Running on Intel(R) Core(TM) i7-10750H +### Operating system +Ubuntu22.04 +### compile +```sh +make -j32 +``` +### MobileVLM-1.7B case +**input** +```sh +-m /path/to/ggml-model-q4_k.gguf \ + --mmproj /path/to/mmproj-model-f16.gguf \ + --image /path/to/many_llamas.jpeg + -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" \ +``` +**output** +```sh +encode_image_with_clip: image embedding created: 144 tokens + +encode_image_with_clip: image encoded in 2730.94 ms by CLIP ( 18.96 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that?ASSISTANT: + + A group of llamas are walking together in a field. + +llama_print_timings: load time = 5506.60 ms +llama_print_timings: sample time = 0.44 ms / 13 runs ( 0.03 ms per token, 29545.45 tokens per second) +llama_print_timings: prompt eval time = 2031.58 ms / 190 tokens ( 10.69 ms per token, 93.52 tokens per second) +llama_print_timings: eval time = 438.92 ms / 12 runs ( 36.58 ms per token, 27.34 tokens per second) +llama_print_timings: total time = 5990.25 ms / 202 tokens +``` + +### MobileVLM_V2-1.7B case +**input** + +Just the same as above. + +**ouput** +```sh +encode_image_with_clip: image embedding created: 144 tokens + +encode_image_with_clip: image encoded in 3223.89 ms by CLIP ( 22.39 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that?ASSISTANT: + + The image captures a tranquil scene in a park, where a group of approximately 20 llamas are gathered. The llamas, a mix of white and black, are standing in a line, their black and white patterns contrasting with the lush green grass of the park. The lamas are arranged in a line, suggesting a social order. + +The park itself is lush and green, with trees dotting the landscape in the background. A sign reading "Llamas Tico Ana" is also visible in the image, possibly indicating the location or the breed of the llamas. The image seems to be taken from a distance, providing a wide view of the scene and the surrounding environment. + +The llamas' positions relative to each other, the sign, and the trees create a harmonious composition. The image does not contain any discernible text. The overall scene is one of peace and natural beauty, with the llamas in their natural habitat, surrounded by the vibrant colors and lush greenery of the park. + +llama_print_timings: load time = 6642.61 ms +llama_print_timings: sample time = 8.15 ms / 223 runs ( 0.04 ms per token, 27358.61 tokens per second) +llama_print_timings: prompt eval time = 2475.07 ms / 190 tokens ( 13.03 ms per token, 76.77 tokens per second) +llama_print_timings: eval time = 8760.60 ms / 222 runs ( 39.46 ms per token, 25.34 tokens per second) +llama_print_timings: total time = 15513.95 ms / 412 tokens +``` + +## Run on Intel(R) Core(TM) Ultra7 115H +### operation system +Windows11 +### comiple +```sh +make -j32 +``` +### MobileVLM-1.7B case +**input** +```sh +-m /path/to/ggml-model-q4_k.gguf \ + --mmproj /path/to/tmp/mmproj-model-f16.gguf \ + -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat's that? ASSISTANT:" \ +``` +**output** +```sh +encode_image_with_clip: image encoded in 4902.81 ms by CLIP ( 34.05 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that? ASSISTANT: + + The image features a group of brown and white llamas standing in a grassy field. + +llama_print_timings: load time = 7441.06 ms +llama_print_timings: sample time = 0.72 ms / 19 runs ( 0.04 ms per token, 26279.39 tokens per second) +llama_print_timings: prompt eval time = 2090.71 ms / 191 tokens ( 10.95 ms per token, 91.36 tokens per second) +llama_print_timings: eval time = 512.35 ms / 18 runs ( 28.46 ms per token, 35.13 tokens per second) +llama_print_timings: total time = 7987.23 ms / 209 tokens +``` + +### MobileVLM_V2-1.7B case +**input** + +Just the same as above. + +**output** +```sh +encode_image_with_clip: image encoded in 4682.44 ms by CLIP ( 32.52 ms per image patch) +system_prompt: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: +user_prompt: \nWhat's that? ASSISTANT: + + This image captures a lively scene of a group of 14 llamas in a grassy field. The llamas, with their distinctive black and white coats, are standing and walking in a line, seemingly engaged in a social activity. One + of them, possibly the first in the line, has its back turned, perhaps observing something in the distance. + +The llama in the front of the line stands out due to its black and white coloring, which is quite unusual for llama patterns. The llama in the front also seems to be more aware of its surroundings, as it faces the camera, giving a sense of engagement with the viewer. + +The image is taken from the side of the llama, providing a clear view of the llama in the front and its companions. The lameness in the llama in + front is not visible, indicating that it might not be the main focus of the photo. + +The background of the image features a grassy field, with a fence and a tree visible in the distance. The tree appears to be bare, suggesting that it might be during a time of year when most trees are dormant or have shed their leaves. + + +llama_print_timings: load time = 7015.35 ms +llama_print_timings: sample time = 10.61 ms / 256 runs ( 0.04 ms per token, 24119.09 tokens per second) +llama_print_timings: prompt eval time = 2052.45 ms / 191 tokens ( 10.75 ms per token, 93.06 tokens per second) +llama_print_timings: eval time = 7259.43 ms / 255 runs ( 28.47 ms per token, 35.13 tokens per second) +llama_print_timings: total time = 14371.19 ms / 446 tokens +``` ## TODO @@ -191,5 +373,5 @@ The `n_patch` of output in `ldp` is 1/4 of the input. In order to implement quic ## contributor ```sh -zhangjidong05, yangyang260, huyiming03, chenxiaotao03 +zhangjidong05, yangyang260, huyiming03, chenxiaotao03, ZiangWu-77 ``` diff --git a/examples/llava/README.md b/examples/llava/README.md index 67cb0f22b..4fb0cf381 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -24,7 +24,7 @@ After building, run: `./llava-cli` to see the usage. For example: ## LLaVA 1.5 -- Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: +1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: ```sh git clone https://huggingface.co/liuhaotian/llava-v1.5-7b @@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa python ./convert.py ../llava-v1.5-7b --skip-unknown ``` -Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory. +Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. ## LLaVA 1.6 gguf conversion 1) First clone a LLaVA 1.6 model: diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 48caafa87..95fbe3d02 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -3,11 +3,12 @@ // I'll gradually clean and extend it // Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch #include "clip.h" +#include "log.h" #include "ggml.h" #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA #include "ggml-cuda.h" #endif @@ -23,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) { #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" +#define TN_PATCH_BIAS "v.patch_embd.bias" #define TN_ATTN_K "%s.blk.%d.attn_k.%s" #define TN_ATTN_Q "%s.blk.%d.attn_q.%s" #define TN_ATTN_V "%s.blk.%d.attn_v.%s" @@ -145,7 +146,7 @@ static std::map PROJECTOR_TYPE_NAMES = { static int get_key_idx(const gguf_context * ctx, const char * key) { int i = gguf_find_key(ctx, key); if (i == -1) { - fprintf(stderr, "key %s not found in file\n", key); + LOG_TEE("key %s not found in file\n", key); throw std::runtime_error(format("Missing required key: %s", key)); } @@ -247,7 +248,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) { static void print_tensor_info(const ggml_tensor * tensor, const char * prefix = "") { size_t tensor_size = ggml_nbytes(tensor); - printf("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", + LOG_TEE("%s: n_dims = %d, name = %s, tensor_size=%zu, shape:[%" PRId64 ", %" PRId64 ", %" PRId64 ", %" PRId64 "], type = %s\n", prefix, ggml_n_dims(tensor), tensor->name, tensor_size, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], ggml_type_name(tensor->type)); } @@ -265,7 +266,7 @@ static projector_type clip_projector_type_from_string(const std::string & name) static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -284,7 +285,7 @@ static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::s static void clip_image_save_to_bmp(const clip_image_u8& img, const std::string& filename) { std::ofstream file(filename, std::ios::binary); if (!file.is_open()) { - std::cerr << "Failed to open file for writing: " << filename << std::endl; + LOG_TEE("Failed to open file for writing: %s\n", filename.c_str()); return; } @@ -425,6 +426,7 @@ struct clip_vision_model { // embeddings struct ggml_tensor * class_embedding; struct ggml_tensor * patch_embeddings; + struct ggml_tensor * patch_bias; struct ggml_tensor * position_embeddings; struct ggml_tensor * pre_ln_w; @@ -501,6 +503,11 @@ struct clip_ctx { bool use_gelu = false; int32_t ftype = 1; + bool has_class_embedding = true; + bool has_pre_norm = true; + bool has_post_norm = false; + bool has_patch_bias = false; + struct gguf_context * ctx_gguf; struct ggml_context * ctx_data; @@ -515,7 +522,7 @@ struct clip_ctx { static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch * imgs) { if (!ctx->has_vision_encoder) { - printf("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return nullptr; } @@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side); - const int num_positions = num_patches + 1; + const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size); inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3)); + if (ctx->has_patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + // concat class_embeddings and patch_embeddings - struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); + struct ggml_tensor * embeddings = inp; + if (ctx->has_class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + ggml_set_name(embeddings, "embeddings"); + ggml_set_input(embeddings); + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); + } - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions); ggml_set_name(positions, "positions"); @@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); // pre-layernorm - { + if (ctx->has_pre_norm) { embeddings = ggml_norm(ctx0, embeddings, eps); ggml_set_name(embeddings, "pre_ln"); @@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 embeddings = cur; } + // post-layernorm + if (ctx->has_post_norm) { + embeddings = ggml_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b); + } + // llava projector { embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]); @@ -835,9 +857,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0); // weight ne = [3, 3, 2048, 1] struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1); - peg_0 = ggml_add(ctx0, peg_0, mlp_2); peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3)); peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b); + mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 2, 0, 3)); + peg_0 = ggml_add(ctx0, peg_0, mlp_2); peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]); embeddings = peg_0; } @@ -878,21 +901,21 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const int idx_name = gguf_find_key(ctx, KEY_NAME); if (idx_name != -1) { // make name optional temporarily as some of the uploaded models missing it due to a bug const std::string name = gguf_get_val_str(ctx, idx_name); - printf("%s: model name: %s\n", __func__, name.c_str()); + LOG_TEE("%s: model name: %s\n", __func__, name.c_str()); } - printf("%s: description: %s\n", __func__, description.c_str()); - printf("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); - printf("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); - printf("%s: n_tensors: %d\n", __func__, n_tensors); - printf("%s: n_kv: %d\n", __func__, n_kv); - printf("%s: ftype: %s\n", __func__, ftype_str.c_str()); - printf("\n"); + LOG_TEE("%s: description: %s\n", __func__, description.c_str()); + LOG_TEE("%s: GGUF version: %d\n", __func__, gguf_get_version(ctx)); + LOG_TEE("%s: alignment: %zu\n", __func__, gguf_get_alignment(ctx)); + LOG_TEE("%s: n_tensors: %d\n", __func__, n_tensors); + LOG_TEE("%s: n_kv: %d\n", __func__, n_kv); + LOG_TEE("%s: ftype: %s\n", __func__, ftype_str.c_str()); + LOG_TEE("\n"); } const int n_tensors = gguf_get_n_tensors(ctx); // kv const int n_kv = gguf_get_n_kv(ctx); - printf("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", + LOG_TEE("%s: loaded meta data with %d key-value pairs and %d tensors from %s\n", __func__, n_kv, n_tensors, fname); { std::map n_type; @@ -903,7 +926,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { n_type[type]++; } - printf("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + LOG_TEE("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); for (int i = 0; i < n_kv; i++) { const char * name = gguf_get_key(ctx, i); const enum gguf_type type = gguf_get_kv_type(ctx, i); @@ -919,7 +942,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } replace_all(value, "\n", "\\n"); - printf("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); + LOG_TEE("%s: - kv %3d: %42s %-16s = %s\n", __func__, i, name, type_name.c_str(), value.c_str()); } // print type counts @@ -928,7 +951,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { continue; } - printf("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); + LOG_TEE("%s: - type %4s: %4d tensors\n", __func__, ggml_type_name(kv.first), kv.second); } } @@ -943,7 +966,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { size_t tensor_size = ggml_nbytes(cur); model_size += tensor_size; if (verbosity >= 3) { - printf("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", + LOG_TEE("%s: tensor[%d]: n_dims = %d, name = %s, tensor_size=%zu, offset=%zu, shape:[%" PRIu64 ", %" PRIu64 ", %" PRIu64 ", %" PRIu64 "], type = %s\n", __func__, i, ggml_n_dims(cur), cur->name, tensor_size, offset, cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_type_name(type)); } } @@ -968,20 +991,20 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } } -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA new_clip->backend = ggml_backend_cuda_init(0); - printf("%s: CLIP using CUDA backend\n", __func__); + LOG_TEE("%s: CLIP using CUDA backend\n", __func__); #endif #ifdef GGML_USE_METAL new_clip->backend = ggml_backend_metal_init(); - printf("%s: CLIP using Metal backend\n", __func__); + LOG_TEE("%s: CLIP using Metal backend\n", __func__); #endif if (!new_clip->backend) { new_clip->backend = ggml_backend_cpu_init(); - printf("%s: CLIP using CPU backend\n", __func__); + LOG_TEE("%s: CLIP using CPU backend\n", __func__); } // model size and capabilities @@ -1005,15 +1028,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->use_gelu = gguf_get_val_bool(ctx, idx); if (verbosity >= 1) { - printf("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); - printf("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); - printf("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); - printf("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); - printf("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); + LOG_TEE("%s: text_encoder: %d\n", __func__, new_clip->has_text_encoder); + LOG_TEE("%s: vision_encoder: %d\n", __func__, new_clip->has_vision_encoder); + LOG_TEE("%s: llava_projector: %d\n", __func__, new_clip->has_llava_projector); + LOG_TEE("%s: model size: %.2f MB\n", __func__, model_size / 1024.0 / 1024.0); + LOG_TEE("%s: metadata size: %.2f MB\n", __func__, ggml_get_mem_size(meta) / 1024.0 / 1024.0); } } - printf("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); + LOG_TEE("%s: params backend buffer size = % 6.2f MB (%i tensors)\n", __func__, model_size / (1024.0 * 1024.0), n_tensors); // load tensors { @@ -1026,7 +1049,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { new_clip->ctx_data = ggml_init(params); if (!new_clip->ctx_data) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); + LOG_TEE("%s: ggml_init() failed\n", __func__); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1034,7 +1057,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { auto fin = std::ifstream(fname, std::ios::binary); if (!fin) { - printf("cannot open model file for loading tensors\n"); + LOG_TEE("cannot open model file for loading tensors\n"); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1056,7 +1079,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { const size_t offset = gguf_get_data_offset(ctx) + gguf_get_tensor_offset(ctx, i); fin.seekg(offset, std::ios::beg); if (!fin) { - printf("%s: failed to seek for tensor %s\n", __func__, name); + LOG_TEE("%s: failed to seek for tensor %s\n", __func__, name); clip_free(new_clip); gguf_free(ctx); return nullptr; @@ -1127,34 +1150,61 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } if (verbosity >= 2) { - printf("\n%s: vision model hparams\n", __func__); - printf("image_size %d\n", hparams.image_size); - printf("patch_size %d\n", hparams.patch_size); - printf("v_hidden_size %d\n", hparams.hidden_size); - printf("v_n_intermediate %d\n", hparams.n_intermediate); - printf("v_projection_dim %d\n", hparams.projection_dim); - printf("v_n_head %d\n", hparams.n_head); - printf("v_n_layer %d\n", hparams.n_layer); - printf("v_eps %f\n", hparams.eps); - printf("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); - printf("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); - printf("v_image_grid_pinpoints: "); + LOG_TEE("\n%s: vision model hparams\n", __func__); + LOG_TEE("image_size %d\n", hparams.image_size); + LOG_TEE("patch_size %d\n", hparams.patch_size); + LOG_TEE("v_hidden_size %d\n", hparams.hidden_size); + LOG_TEE("v_n_intermediate %d\n", hparams.n_intermediate); + LOG_TEE("v_projection_dim %d\n", hparams.projection_dim); + LOG_TEE("v_n_head %d\n", hparams.n_head); + LOG_TEE("v_n_layer %d\n", hparams.n_layer); + LOG_TEE("v_eps %f\n", hparams.eps); + LOG_TEE("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]); + LOG_TEE("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]); + LOG_TEE("v_image_grid_pinpoints: "); for (int i = 0; i < 32 && (hparams.image_grid_pinpoints[i] != 0); ++i) { - printf("%d ", hparams.image_grid_pinpoints[i]); + LOG_TEE("%d ", hparams.image_grid_pinpoints[i]); } - printf("\n"); - printf("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); + LOG_TEE("\n"); + LOG_TEE("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type); } + try { + vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); + new_clip->has_class_embedding = true; + } catch (const std::exception& e) { + new_clip->has_class_embedding = false; + } + + try { + vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); + vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); + new_clip->has_pre_norm = true; + } catch (std::exception & e) { + new_clip->has_pre_norm = false; + } + + try { + vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight")); + vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias")); + new_clip->has_post_norm = true; + } catch (std::exception & e) { + new_clip->has_post_norm = false; + } + + try { + vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS); + new_clip->has_patch_bias = true; + } catch (std::exception & e) { + new_clip->has_patch_bias = false; + } + try { vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD); - vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD); vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v")); - vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight")); - vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias")); } catch(const std::exception& e) { - fprintf(stderr, "%s: failed to load vision model tensors\n", __func__); + LOG_TEE("%s: failed to load vision model tensors\n", __func__); } // LLaVA projection @@ -1183,7 +1233,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { } catch (std::runtime_error & e) { } try { vision_model.image_newline = get_tensor(new_clip->ctx_data, TN_IMAGE_NEWLINE); - // fprintf(stderr, "%s: image_newline tensor (llava-1.6) found\n", __func__); + // LOG_TEE("%s: image_newline tensor (llava-1.6) found\n", __func__); } catch (std::runtime_error & e) { } } else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) { // MobileVLM projection @@ -1263,7 +1313,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) { ggml_cgraph * gf = clip_image_build_graph(new_clip, &batch); ggml_gallocr_reserve(new_clip->compute_alloc, gf); size_t compute_memory_buffer_size = ggml_gallocr_get_buffer_size(new_clip->compute_alloc, 0); - printf("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); + LOG_TEE("%s: compute allocated memory: %.2f MB\n", __func__, compute_memory_buffer_size /1024.0/1024.0); } return new_clip; @@ -1303,7 +1353,7 @@ bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { int nx, ny, nc; auto * data = stbi_load(fname, &nx, &ny, &nc, 3); if (!data) { - fprintf(stderr, "%s: failed to load image '%s'\n", __func__, fname); + LOG_TEE("%s: failed to load image '%s'\n", __func__, fname); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1315,7 +1365,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length int nx, ny, nc; auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); if (!data) { - fprintf(stderr, "%s: failed to decode image bytes\n", __func__); + LOG_TEE("%s: failed to decode image bytes\n", __func__); return false; } build_clip_img_from_data(data, nx, ny, img); @@ -1324,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length } // Linear interpolation between two points -inline float lerp(float s, float e, float t) { +inline float clip_lerp(float s, float e, float t) { return s + (e - s) * t; } // Bilinear resize function @@ -1346,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta float y_lerp = py - y_floor; for (int c = 0; c < 3; c++) { - float top = lerp( + float top = clip_lerp( static_cast(src.buf[3 * (y_floor * src.nx + x_floor) + c]), static_cast(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]), x_lerp ); - float bottom = lerp( + float bottom = clip_lerp( static_cast(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]), static_cast(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]), x_lerp ); - dst.buf[3 * (y * target_width + x) + c] = static_cast(lerp(top, bottom, y_lerp)); + dst.buf[3 * (y * target_width + x) + c] = static_cast(clip_lerp(top, bottom, y_lerp)); } } } @@ -1505,7 +1555,7 @@ static std::pair select_best_resolution(const std::pair & or int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -1544,7 +1594,7 @@ static std::vector divide_to_patches_u8(const clip_image_u8 & im bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) { bool pad_to_square = true; if (!ctx->has_vision_encoder) { - printf("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } auto & params = ctx->vision_model.hparams; @@ -1621,7 +1671,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli } for (size_t i = 0; i < patches.size(); i++) { - // printf("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); + // LOG_TEE("patch %d: %d %d\n", i, patches[i]->nx, patches[i]->ny); clip_image_u8_free(patches[i]); } @@ -1755,7 +1805,7 @@ int clip_n_patches(const struct clip_ctx * ctx) { int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { + if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) { n_patches /= 4; } @@ -1764,7 +1814,7 @@ int clip_n_patches(const struct clip_ctx * ctx) { bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { if (!ctx->has_vision_encoder) { - printf("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } @@ -1776,7 +1826,7 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs, float * vec) { if (!ctx->has_vision_encoder) { - printf("This gguf file seems to have no vision encoder\n"); + LOG_TEE("This gguf file seems to have no vision encoder\n"); return false; } @@ -1796,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int image_size = hparams.image_size; const int patch_size = hparams.patch_size; const int num_patches = ((image_size / patch_size) * (image_size / patch_size)); - const int num_positions = num_patches + 1; + const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0); { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); @@ -1824,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + if (ctx->has_class_embedding) { + struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); + void* zero_mem = malloc(ggml_nbytes(embeddings)); + memset(zero_mem, 0, ggml_nbytes(embeddings)); + ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); + free(zero_mem); + } } { @@ -1938,7 +1990,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i new_type = type; if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) { new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type - // fprintf(stderr, "%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); + // LOG_TEE("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type)); } const size_t n_elms = ggml_nelements(cur); float * f32_data; @@ -1957,7 +2009,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i f32_data = (float *)conv_buf.data(); break; default: - printf("Please use an input file in f32 or f16\n"); + LOG_TEE("Please use an input file in f32 or f16\n"); gguf_free(ctx_out); return false; } @@ -1984,7 +2036,7 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i fout.put(0); } - printf("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, + LOG_TEE("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize, orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); } @@ -2000,8 +2052,8 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i gguf_free(ctx_out); { - printf("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); - printf("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); + LOG_TEE("%s: original size = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0); + LOG_TEE("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0); } return true; diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index e29da6cb2..da60ddf2f 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -1,4 +1,5 @@ #include "ggml.h" +#include "log.h" #include "common.h" #include "clip.h" #include "llava.h" @@ -18,7 +19,7 @@ static bool eval_tokens(struct llama_context * ctx_llama, std::vector%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); + LOG_TEE("%s: invalid base64 image tag. must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); return NULL; } @@ -87,7 +88,7 @@ static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); if (!embed) { - fprintf(stderr, "%s: could not load image from base64 string.\n", __func__); + LOG_TEE("%s: could not load image from base64 string.\n", __func__); return NULL; } @@ -112,29 +113,29 @@ struct llava_context { }; static void show_additional_info(int /*argc*/, char ** argv) { - fprintf(stderr, "\n example usage: %s -m --mmproj --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - fprintf(stderr, " note: a lower temperature value like 0.1 is recommended for better quality.\n"); + LOG_TEE("\n example usage: %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); + LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n"); } -static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) { +static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) { // load and preprocess the image llava_image_embed * embed = NULL; auto prompt = params->prompt; if (prompt_contains_image(prompt)) { if (!params->image.empty()) { - fprintf(stderr, "using base64 encoded image instead of command line image path\n"); + LOG_TEE("using base64 encoded image instead of command line image path\n"); } embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->n_threads, prompt); if (!embed) { - fprintf(stderr, "%s: can't load image from prompt\n", __func__); + LOG_TEE("%s: can't load image from prompt\n", __func__); return NULL; } params->prompt = remove_image_from_prompt(prompt); } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, params->image.c_str()); + embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str()); if (!embed) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, params->image.c_str()); + fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); return NULL; } } @@ -146,7 +147,6 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ int n_past = 0; const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict; - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama)); std::string system_prompt, user_prompt; size_t image_pos = prompt.find(""); @@ -154,18 +154,18 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ // new templating mode: Provide the full prompt including system message and use as a placeholder for the image system_prompt = prompt.substr(0, image_pos); user_prompt = prompt.substr(image_pos + std::string("").length()); - printf("system_prompt: %s\n", system_prompt.c_str()); + LOG_TEE("system_prompt: %s\n", system_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } - printf("user_prompt: %s\n", user_prompt.c_str()); + LOG_TEE("user_prompt: %s\n", user_prompt.c_str()); if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } else { @@ -175,20 +175,25 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ if (params->verbose_prompt) { auto tmp = ::llama_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); for (int i = 0; i < (int) tmp.size(); i++) { - printf("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); } } } - eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos); + eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); // generate the response - fprintf(stderr, "\n"); + LOG_TEE("\n"); struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams); + if (!ctx_sampling) { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } + std::string response = ""; for (int i = 0; i < max_tgt_len; i++) { const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past); @@ -207,8 +212,21 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_ printf("\n"); } +static struct llama_model * llava_init(gpt_params * params) { + llama_backend_init(); + llama_numa_init(params->numa); -static struct llava_context * llava_init(gpt_params * params) { + llama_model_params model_params = llama_model_params_from_gpt_params(*params); + + llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n" , __func__); + return NULL; + } + return model; +} + +static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) { const char * clip_path = params->mmproj.c_str(); auto prompt = params->prompt; @@ -218,16 +236,6 @@ static struct llava_context * llava_init(gpt_params * params) { auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1); - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = llama_model_params_from_gpt_params(*params); - - llama_model * model = llama_load_model_from_file(params->model.c_str(), model_params); - if (model == NULL) { - fprintf(stderr , "%s: error: unable to load model\n" , __func__); - return NULL; - } llama_context_params ctx_params = llama_context_params_from_gpt_params(*params); ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings @@ -235,7 +243,7 @@ static struct llava_context * llava_init(gpt_params * params) { llama_context * ctx_llama = llama_new_context_with_model(model, ctx_params); if (ctx_llama == NULL) { - fprintf(stderr , "%s: error: failed to create the llama_context\n" , __func__); + LOG_TEE("%s: error: failed to create the llama_context\n" , __func__); return NULL; } @@ -258,6 +266,12 @@ static void llava_free(struct llava_context * ctx_llava) { llama_backend_free(); } +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + int main(int argc, char ** argv) { ggml_time_init(); @@ -267,29 +281,43 @@ int main(int argc, char ** argv) { show_additional_info(argc, argv); return 1; } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("llava", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { gpt_print_usage(argc, argv, params); show_additional_info(argc, argv); return 1; } - - auto ctx_llava = llava_init(¶ms); - if (ctx_llava == NULL) { - fprintf(stderr, "%s: error: failed to init llava\n", __func__); + auto model = llava_init(¶ms); + if (model == NULL) { + fprintf(stderr, "%s: error: failed to init llava model\n", __func__); return 1; } - auto image_embed = load_image(ctx_llava, ¶ms); - if (!image_embed) { - return 1; + for (auto & image : params.image) { + auto ctx_llava = llava_init_context(¶ms, model); + + auto image_embed = load_image(ctx_llava, ¶ms, image); + if (!image_embed) { + std::cerr << "error: failed to load image " << image << ". Terminating\n\n"; + return 1; + } + + // process the prompt + process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); + + llama_print_timings(ctx_llava->ctx_llama); + llava_image_embed_free(image_embed); + ctx_llava->model = NULL; + llava_free(ctx_llava); } + llama_free_model(model); - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_print_timings(ctx_llava->ctx_llama); - - llava_image_embed_free(image_embed); - llava_free(ctx_llava); return 0; } diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 29764757a..9a990bb18 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -54,7 +54,7 @@ static std::pair select_best_resolution(const std::pair& ori int downscaled_height = static_cast(original_height * scale); int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height); int wasted_resolution = (width * height) - effective_resolution; - // fprintf(stderr, "resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); + // LOG_TEE("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution); if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) { max_effective_resolution = effective_resolution; min_wasted_resolution = wasted_resolution; @@ -154,13 +154,13 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.newline = ggml_new_tensor_1d(model.ctx, GGML_TYPE_F32, newline_tmp->ne[0]); if (newline_tmp->backend != GGML_BACKEND_TYPE_CPU) { if (newline_tmp->buffer == NULL) { - printf("newline_tmp tensor buffer is NULL\n"); + LOG_TEE("newline_tmp tensor buffer is NULL\n"); } ggml_backend_tensor_get(newline_tmp, model.newline->data, 0, ggml_nbytes(newline_tmp)); } else { model.newline->data = newline_tmp->data; if (model.newline->data == NULL) { - printf("newline_tmp tensor data is NULL\n"); + LOG_TEE("newline_tmp tensor data is NULL\n"); } } @@ -224,7 +224,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli img_res_v.size = 0; img_res_v.data = nullptr; if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) { - fprintf(stderr, "%s: unable to preprocess image\n", __func__); + LOG_TEE("%s: unable to preprocess image\n", __func__); delete[] img_res_v.data; return false; } @@ -239,7 +239,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096 delete[] img_res_v.data; if (!encoded) { - fprintf(stderr, "Unable to encode image\n"); + LOG_TEE("Unable to encode image\n"); return false; } @@ -252,12 +252,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184 const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside if (!encoded) { - fprintf(stderr, "Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); + LOG_TEE("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size); return false; } } const int64_t t_img_enc_batch_us = ggml_time_us(); - printf("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); + LOG_TEE("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0); const int32_t * image_grid = clip_image_grid(ctx_clip); @@ -290,12 +290,12 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli // clip_image_save_to_bmp(*tmp, "image_feature.bmp"); } - printf("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); + LOG_TEE("%s: image embedding created: %d tokens\n", __func__, *n_img_pos); const int64_t t_img_enc_end_us = ggml_time_us(); float t_img_enc_ms = (t_img_enc_end_us - t_img_enc_start_us) / 1000.0; - printf("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); + LOG_TEE("\n%s: image encoded in %8.2f ms by CLIP (%8.2f ms per image patch)\n", __func__, t_img_enc_ms, t_img_enc_ms / *n_img_pos); return true; } @@ -305,7 +305,7 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * int n_llama_embd = llama_n_embd(llama_get_model(ctx_llama)); auto n_image_embd = clip_n_mmproj_embd(ctx_clip); if (n_image_embd != n_llama_embd) { - printf("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); + LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_image_embd, n_llama_embd); return false; } return true; @@ -314,13 +314,13 @@ bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out) { float * image_embd = (float *)malloc(clip_embd_nbytes(ctx_clip)*6); // TODO: base on gridsize/llava model if (!image_embd) { - fprintf(stderr, "Unable to allocate memory for image embeddings\n"); + LOG_TEE("Unable to allocate memory for image embeddings\n"); return false; } int n_img_pos; if (!encode_image_with_clip(ctx_clip, n_threads, img, image_embd, &n_img_pos)) { - fprintf(stderr, "%s: cannot encode image, aborting\n", __func__); + LOG_TEE("%s: cannot encode image, aborting\n", __func__); free(image_embd); return false; } @@ -340,7 +340,7 @@ bool llava_eval_image_embed(llama_context * ctx_llama, const struct llava_image_ } llama_batch batch = {int32_t(n_eval), nullptr, (image_embed->embed+i*n_embd), nullptr, nullptr, nullptr, nullptr, *n_past, 1, 0, }; if (llama_decode(ctx_llama, batch)) { - fprintf(stderr, "%s : failed to eval\n", __func__); + LOG_TEE("%s : failed to eval\n", __func__); return false; } *n_past += n_eval; @@ -352,7 +352,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c clip_image_u8 * img = clip_image_u8_init(); if (!clip_image_load_from_bytes(image_bytes, image_bytes_length, img)) { clip_image_u8_free(img); - fprintf(stderr, "%s: can't load image from bytes, is it a valid image?", __func__); + LOG_TEE("%s: can't load image from bytes, is it a valid image?", __func__); return NULL; } @@ -361,7 +361,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c bool image_embed_result = llava_image_embed_make_with_clip_img(ctx_clip, n_threads, img, &image_embed, &n_image_pos); if (!image_embed_result) { clip_image_u8_free(img); - fprintf(stderr, "%s: coulnd't embed the image\n", __func__); + LOG_TEE("%s: coulnd't embed the image\n", __func__); return NULL; } @@ -375,7 +375,7 @@ struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * c static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long *sizeOut) { auto file = fopen(path, "rb"); if (file == NULL) { - fprintf(stderr, "%s: can't read file %s\n", __func__, path); + LOG_TEE("%s: can't read file %s\n", __func__, path); return false; } @@ -385,7 +385,7 @@ static bool load_file_to_bytes(const char* path, unsigned char** bytesOut, long auto buffer = (unsigned char *)malloc(fileSize); // Allocate memory to hold the file data if (buffer == NULL) { - fprintf(stderr, "%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); + LOG_TEE("%s: failed to alloc %ld bytes for file %s\n", __func__, fileSize, path); perror("Memory allocation error"); fclose(file); return false; @@ -410,7 +410,7 @@ struct llava_image_embed * llava_image_embed_make_with_filename(struct clip_ctx long image_bytes_length; auto loaded = load_file_to_bytes(image_path, &image_bytes, &image_bytes_length); if (!loaded) { - fprintf(stderr, "%s: failed to load %s\n", __func__, image_path); + LOG_TEE("%s: failed to load %s\n", __func__, image_path); return NULL; } diff --git a/examples/lookahead/lookahead.cpp b/examples/lookahead/lookahead.cpp index e2551e7a4..9c3540b20 100644 --- a/examples/lookahead/lookahead.cpp +++ b/examples/lookahead/lookahead.cpp @@ -64,13 +64,10 @@ int main(int argc, char ** argv) { std::tie(model, ctx) = llama_init_from_gpt_params(params); // Tokenize the prompt - const bool add_bos = llama_should_add_bos_token(model); - LOG("add_bos tgt: %d\n", add_bos); - std::vector inp; std::vector all; - inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + inp = ::llama_tokenize(ctx, params.prompt, true, true); all = inp; const int max_context_size = llama_n_ctx(ctx); @@ -302,7 +299,7 @@ int main(int argc, char ** argv) { } fflush(stdout); - if (id == llama_token_eos(model)) { + if (llama_token_is_eog(model, id)) { has_eos = true; } diff --git a/examples/lookup/CMakeLists.txt b/examples/lookup/CMakeLists.txt index c060b8f56..b91633f63 100644 --- a/examples/lookup/CMakeLists.txt +++ b/examples/lookup/CMakeLists.txt @@ -3,3 +3,21 @@ add_executable(${TARGET} lookup.cpp) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-create) +add_executable(${TARGET} lookup-create.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-merge) +add_executable(${TARGET} lookup-merge.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) + +set(TARGET lookup-stats) +add_executable(${TARGET} lookup-stats.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/lookup/lookup-create.cpp b/examples/lookup/lookup-create.cpp new file mode 100644 index 000000000..1c230c966 --- /dev/null +++ b/examples/lookup/lookup-create.cpp @@ -0,0 +1,41 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv){ + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + GGML_ASSERT(model != nullptr); + + // tokenize the prompt + std::vector inp; + inp = ::llama_tokenize(ctx, params.prompt, true, true); + fprintf(stderr, "%s: tokenization done\n", __func__); + + + llama_ngram_cache ngram_cache; + llama_ngram_cache_update(ngram_cache, LLAMA_NGRAM_STATIC, LLAMA_NGRAM_STATIC, inp, inp.size(), true); + fprintf(stderr, "%s: hashing done, writing file to %s\n", __func__, params.lookup_cache_static.c_str()); + + llama_ngram_cache_save(ngram_cache, params.lookup_cache_static); +} diff --git a/examples/lookup/lookup-merge.cpp b/examples/lookup/lookup-merge.cpp new file mode 100644 index 000000000..07c93eb8d --- /dev/null +++ b/examples/lookup/lookup-merge.cpp @@ -0,0 +1,47 @@ +#include "ggml.h" +#include "llama.h" +#include "common.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include +#include + +static void print_usage() { + fprintf(stderr, "Merges multiple lookup cache files into a single one.\n"); + fprintf(stderr, "Usage: lookup-merge [--help] lookup_part_1.bin lookup_part_2.bin ... lookup_merged.bin\n"); +} + +int main(int argc, char ** argv){ + if (argc < 3) { + print_usage(); + exit(1); + } + + std::vector args; + args.resize(argc-1); + for (int i = 0; i < argc-1; ++i) { + args[i] = argv[i+1]; + if (args[i] == "-h" || args[i] == "--help") { + print_usage(); + exit(0); + } + } + + fprintf(stderr, "lookup-merge: loading file %s\n", args[0].c_str()); + llama_ngram_cache ngram_cache_merged = llama_ngram_cache_load(args[0]); + + for (size_t i = 1; i < args.size()-1; ++i) { + fprintf(stderr, "lookup-merge: loading file %s\n", args[i].c_str()); + llama_ngram_cache ngram_cache = llama_ngram_cache_load(args[i]); + + llama_ngram_cache_merge(ngram_cache_merged, ngram_cache); + } + + fprintf(stderr, "lookup-merge: saving file %s\n", args.back().c_str()); + llama_ngram_cache_save(ngram_cache_merged, args.back()); +} diff --git a/examples/lookup/lookup-stats.cpp b/examples/lookup/lookup-stats.cpp new file mode 100644 index 000000000..87ecc0a4f --- /dev/null +++ b/examples/lookup/lookup-stats.cpp @@ -0,0 +1,159 @@ +#include "ggml.h" +#include "common.h" +#include "llama.h" +#include "log.h" +#include "ngram-cache.h" + +#include +#include +#include +#include +#include +#include +#include + +int main(int argc, char ** argv){ + gpt_params params; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + + const int n_draft = params.n_draft; + + // init llama.cpp + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model = NULL; + llama_context * ctx = NULL; + + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); + + // tokenize the prompt + std::vector inp; + inp = ::llama_tokenize(ctx, params.prompt, true, true); + + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache ngram_cache_static; + int64_t t_draft_flat_us = 0; + int64_t t_draft_us = 0; + + { + const int64_t t_start_draft_us = ggml_time_us(); + + if (!params.lookup_cache_static.empty()) { + try { + ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + } catch (std::ifstream::failure const &) { + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + exit(1); + } + } + + if (!params.lookup_cache_dynamic.empty()) { + try { + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program + } + + t_draft_flat_us += ggml_time_us() - t_start_draft_us; + } + + const int n_input = inp.size(); + const int n_ctx = params.n_ctx; + + int n_drafted = 0; + int n_accept = 0; + + const int64_t t_start_ms = ggml_time_ms(); + + // Iterate over input tokens in chunks of size n_ctx. + // Each chunk is treated as if a sequential generation but with pre-determined tokens to ensure reproducibility. + for (int i_start = 0; i_start + n_ctx < n_input; i_start += n_ctx) { + const std::vector inp_slice(inp.begin() + i_start, inp.begin() + i_start + n_ctx); + std::vector pseudo_output; + pseudo_output.push_back(inp_slice[0]); + + while ((int) pseudo_output.size() < n_ctx) { + // Simulate drafting and decoding from draft: + std::vector draft; + draft.push_back(pseudo_output.back()); + + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_draft(pseudo_output, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + + n_drafted += draft.size() - 1; + + for (size_t j = 1; j < draft.size() && (int) pseudo_output.size() < n_ctx; ++j) { + const llama_token ground_truth = inp_slice[pseudo_output.size()]; + const llama_token drafted = draft[j]; + + if (ground_truth != drafted) { + break; + } + + ++n_accept; + pseudo_output.push_back(ground_truth); + + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + } + + // After each simulated batch decoding simulate the sampling of a single token: + if ((int) pseudo_output.size() < n_ctx) { + pseudo_output.push_back(inp_slice[pseudo_output.size()]); + { + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, pseudo_output, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } + } + + draft.erase(draft.begin()); + + } + if (i_start > 0 && i_start / 100000 != (i_start - n_ctx) / 100000) { + const int64_t t_now_ms = ggml_time_ms(); + const int64_t eta_ms = (n_input - i_start) * (t_now_ms - t_start_ms) / i_start; + const int64_t eta_min = eta_ms / (60*1000); + const int64_t eta_s = (eta_ms - 60*1000*eta_min) / 1000; + + LOG_TEE("lookup-stats: %d/%d done, ETA: %02" PRId64 ":%02" PRId64 "\n", i_start, n_input, eta_min, eta_s); + } + + // After each chunk, update the dynamic ngram cache with the context ngram cache: + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + ngram_cache_context.clear(); + } + + LOG_TEE("\n"); + + LOG_TEE("\n"); + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_input - n_input % n_ctx); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + fprintf(stderr, "\n\n"); + + return 0; +} diff --git a/examples/lookup/lookup.cpp b/examples/lookup/lookup.cpp index b53fae110..eebbd00a5 100644 --- a/examples/lookup/lookup.cpp +++ b/examples/lookup/lookup.cpp @@ -1,12 +1,15 @@ -#include "common.h" #include "ggml.h" #include "llama.h" +#include "common.h" +#include "ngram-cache.h" #include #include #include +#include #include #include +#include int main(int argc, char ** argv){ gpt_params params; @@ -15,11 +18,7 @@ int main(int argc, char ** argv){ return 1; } - // max/min n-grams size to search for in prompt - const int ngram_max = 4; - const int ngram_min = 1; - - // length of the candidate / draft sequence, if match is found + // max. number of additional tokens to draft if match is found const int n_draft = params.n_draft; const bool dump_kv_cache = params.dump_kv_cache; @@ -39,13 +38,40 @@ int main(int argc, char ** argv){ // load the model std::tie(model, ctx) = llama_init_from_gpt_params(params); + GGML_ASSERT(llama_n_vocab(model) < (1 << 16)); // tokenize the prompt - const bool add_bos = llama_should_add_bos_token(model); - LOG("add_bos tgt: %d\n", add_bos); - std::vector inp; - inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + inp = ::llama_tokenize(ctx, params.prompt, true, true); + + llama_ngram_cache ngram_cache_context; + llama_ngram_cache ngram_cache_dynamic; + llama_ngram_cache ngram_cache_static; + int64_t t_draft_flat_us = 0; + int64_t t_draft_us = 0; + + { + // Fill up context ngram cache with tokens from user input: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, inp.size(), false); + + if (!params.lookup_cache_static.empty()) { + try { + ngram_cache_static = llama_ngram_cache_load(params.lookup_cache_static); + } catch (std::ifstream::failure const &) { + fprintf(stderr, "error: failed to open static lookup cache: %s", params.lookup_cache_static.c_str()); + exit(1); + } + } + + if (!params.lookup_cache_dynamic.empty()) { + try { + ngram_cache_dynamic = llama_ngram_cache_load(params.lookup_cache_dynamic); + } catch (std::ifstream::failure const &) {} // if the file does not exist it will simply be created at the end of the program + } + + t_draft_flat_us += ggml_time_us() - t_start_draft_us; + } const int max_context_size = llama_n_ctx(ctx); const int max_tokens_list_size = max_context_size - 4; @@ -76,8 +102,6 @@ int main(int argc, char ** argv){ int n_drafted = 0; int n_accept = 0; - int64_t t_draft_us = 0; - int n_past = inp.size(); bool has_eos = false; @@ -116,7 +140,7 @@ int main(int argc, char ** argv){ printf("%s", token_str.c_str()); } - if (id == llama_token_eos(model)) { + if (llama_token_is_eog(model, id)) { has_eos = true; } @@ -129,6 +153,12 @@ int main(int argc, char ** argv){ ++n_past; ++i_dft; inp.push_back(id); + { + // Update context ngram cache with the newly accepted token: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } if (params.use_color) { // color accepted draft token @@ -149,6 +179,12 @@ int main(int argc, char ** argv){ draft.clear(); draft.push_back(id); inp.push_back(id); + { + // Update context ngram cache with the newly accepted token: + const int64_t t_start_draft_us = ggml_time_us(); + llama_ngram_cache_update(ngram_cache_context, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, inp, 1, false); + t_draft_us += ggml_time_us() - t_start_draft_us; + } break; } @@ -163,44 +199,19 @@ int main(int argc, char ** argv){ llama_batch_clear(batch_tgt); llama_batch_add(batch_tgt, draft[0], n_past, { 0 }, true); - // generate n_pred tokens through prompt lookup - auto prompt_lookup = [&]() -> void { - const int inp_size = inp.size(); - for (int ngram_size = ngram_max ; ngram_size > ngram_min; --ngram_size){ - const llama_token * ngram = &inp[inp_size - ngram_size]; - - for (int i = 0; i <= (int) inp_size - (ngram_size * 2); ++i) { - bool match = true; - for (int j = 0; j < ngram_size; ++j) { - if (inp[i + j] != ngram[j]) { - match = false; - break; - } - } - - if (match) { - const int startIdx = i + ngram_size; - const int endIdx = startIdx + n_draft; - if (endIdx < inp_size) { - for (int j = startIdx; j < endIdx; ++j) { - LOG(" - draft candidate %d: %d\n", j, inp[j]); - draft.push_back(inp[j]); - llama_batch_add(batch_tgt, inp[j], n_past + (j - startIdx) + 1, { 0 }, true); - ++n_drafted; - } - return; - } - } - } - } - return; - }; - + // Draft already contains a single token sampled from the model: + GGML_ASSERT(draft.size() == 1); + GGML_ASSERT(draft[0] == inp.back()); const int64_t t_start_draft_us = ggml_time_us(); - prompt_lookup(); + llama_ngram_cache_draft(inp, draft, n_draft, LLAMA_NGRAM_MIN, LLAMA_NGRAM_MAX, ngram_cache_context, ngram_cache_dynamic, ngram_cache_static); + + for (size_t i = 1; i < draft.size(); ++i) { + llama_batch_add(batch_tgt, draft[i], n_past + i, { 0 }, true); + } t_draft_us += ggml_time_us() - t_start_draft_us; + n_drafted += draft.size() - 1; llama_decode(ctx, batch_tgt); ++n_past; @@ -210,19 +221,24 @@ int main(int argc, char ** argv){ auto t_dec_end = ggml_time_us(); + // Update dynamic ngram cache with context ngram cache and save it to disk: + llama_ngram_cache_merge(ngram_cache_dynamic, ngram_cache_context); + llama_ngram_cache_save(ngram_cache_dynamic, params.lookup_cache_dynamic); + LOG_TEE("\n\n"); LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f)); LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f)); LOG_TEE("\n"); - LOG_TEE("n_draft = %d\n", n_draft); - LOG_TEE("n_predict = %d\n", n_predict); - LOG_TEE("n_drafted = %d\n", n_drafted); - LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", + LOG_TEE("n_draft = %d\n", n_draft); + LOG_TEE("n_predict = %d\n", n_predict); + LOG_TEE("n_drafted = %d\n", n_drafted); + LOG_TEE("t_draft_flat = %.2f ms\n", t_draft_flat_us*1e-3); + LOG_TEE("t_draft = %.2f ms, %.2f us per token, %.2f tokens per second\n", t_draft_us*1e-3, 1.0f*t_draft_us/n_drafted, n_drafted/(1e-6*t_draft_us)); - LOG_TEE("n_accept = %d\n", n_accept); - LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); + LOG_TEE("n_accept = %d\n", n_accept); + LOG_TEE("accept = %.3f%%\n", 100.0f * n_accept / n_drafted); LOG_TEE("\ntarget:\n"); llama_print_timings(ctx); diff --git a/examples/main-cmake-pkg/README.md b/examples/main-cmake-pkg/README.md index 6d665f28f..edf20d8db 100644 --- a/examples/main-cmake-pkg/README.md +++ b/examples/main-cmake-pkg/README.md @@ -8,7 +8,7 @@ Because this example is "outside of the source tree", it is important to first b ### Considerations -When hardware acceleration libraries are used (e.g. CUBlas, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. +When hardware acceleration libraries are used (e.g. CUDA, Metal, CLBlast, etc.), CMake must be able to locate the associated CMake package. In the example below, when building _main-cmake-pkg_ notice the `CMAKE_PREFIX_PATH` includes the Llama CMake package location _in addition to_ the CLBlast package—which was used when compiling _llama.cpp_. ### Build llama.cpp and install to C:\LlamaCPP directory @@ -17,11 +17,9 @@ In this case, CLBlast was already installed so the CMake package is referenced i ```cmd git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -mkdir build -cd build -cmake .. -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 -cmake --build . --config Release -cmake --install . --prefix C:/LlamaCPP +cmake -B build -DBUILD_SHARED_LIBS=OFF -DLLAMA_CLBLAST=ON -DCMAKE_PREFIX_PATH=C:/CLBlast/lib/cmake/CLBlast -G "Visual Studio 17 2022" -A x64 +cmake --build build --config Release +cmake --install build --prefix C:/LlamaCPP ``` ### Build main-cmake-pkg @@ -29,9 +27,7 @@ cmake --install . --prefix C:/LlamaCPP ```cmd cd ..\examples\main-cmake-pkg -mkdir build -cd build -cmake .. -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 -cmake --build . --config Release -cmake --install . --prefix C:/MyLlamaApp +cmake -B build -DBUILD_SHARED_LIBS=OFF -DCMAKE_PREFIX_PATH="C:/CLBlast/lib/cmake/CLBlast;C:/LlamaCPP/lib/cmake/Llama" -G "Visual Studio 17 2022" -A x64 +cmake --build build --config Release +cmake --install build --prefix C:/MyLlamaApp ``` diff --git a/examples/main/README.md b/examples/main/README.md index 6a8d1e1c5..97e2ae4c2 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: -- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set). - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. @@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by ### Extended Context Size -Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. +Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8. - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model. @@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models. - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes. - `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node. -- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus. +- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus. These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. @@ -296,19 +296,23 @@ These options help improve the performance and memory usage of the LLaMA models. ### Batch Size -- `-b N, --batch-size N`: Set the batch size for prompt processing (default: 512). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. +- `-b N, --batch-size N`: Set the batch size for prompt processing (default: `2048`). This large batch size benefits users who have BLAS installed and enabled it during the build. If you don't have BLAS enabled ("BLAS=0"), you can use a smaller number, such as 8, to see the prompt progress as it's evaluated in some situations. + +- `-ub N`, `--ubatch-size N`: physical maximum batch size. This is for pipeline parallelization. Default: `512`. ### Prompt Caching - `--prompt-cache FNAME`: Specify a file to cache the model state after the initial prompt. This can significantly speed up the startup time when you're using longer prompts. The file is created during the first run and is reused and updated in subsequent runs. **Note**: Restoring a cached prompt does not imply restoring the exact state of the session at the point it was saved. So even when specifying a specific seed, you are not guaranteed to get the same sequence of tokens as the original generation. -### Grammars +### Grammars & JSON schemas - `--grammar GRAMMAR`, `--grammar-file FILE`: Specify a grammar (defined inline or in a file) to constrain model output to a specific format. For example, you could force the model to output JSON or to speak only in emojis. See the [GBNF guide](../../grammars/README.md) for details on the syntax. +- `--json-schema SCHEMA`: Specify a [JSON schema](https://json-schema.org/) to constrain model output to (e.g. `{}` for any JSON object, or `{"items": {"type": "string", "minLength": 10, "maxLength": 100}, "minItems": 10}` for a JSON array of strings with size constraints). If a schema uses external `$ref`s, you should use `--grammar "$( python examples/json_schema_to_grammar.py myschema.json )"` instead. + ### Quantization -For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-data--run). +For information about 4-bit quantization, which can significantly improve performance and reduce memory usage, please refer to llama.cpp's primary [README](../../README.md#prepare-and-quantize). ## Additional Options @@ -316,8 +320,8 @@ These options provide extra functionality and customization when running the LLa - `-h, --help`: Display a help message showing all available options and their default values. This is particularly useful for checking the latest options and default values, as they can change frequently, and the information in this document may become outdated. - `--verbose-prompt`: Print the prompt before generating text. -- `-ngl N, --n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. +- `-ngl N, --n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. diff --git a/examples/main/main.cpp b/examples/main/main.cpp index e2d07a631..9dee41001 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -235,17 +235,17 @@ int main(int argc, char ** argv) { // The file exists and is not empty session_tokens.resize(n_ctx); size_t n_token_count_out = 0; - if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + if (!llama_state_load_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); return 1; } session_tokens.resize(n_token_count_out); - llama_set_rng_seed(ctx, params.seed); LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size()); } } const bool add_bos = llama_should_add_bos_token(model); + GGML_ASSERT(llama_add_eos_token(model) != 1); LOG("add_bos: %d\n", add_bos); std::vector embd_inp; @@ -255,7 +255,7 @@ int main(int argc, char ** argv) { if (params.chatml) { params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; } - embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + embd_inp = ::llama_tokenize(ctx, params.prompt, true, true); } else { LOG("use session tokens\n"); embd_inp = session_tokens; @@ -277,10 +277,10 @@ int main(int argc, char ** argv) { if (ctx_guidance) { LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); - guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, true, true); LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); - std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, true, true); LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); original_prompt_len = original_inp.size(); @@ -324,7 +324,7 @@ int main(int argc, char ** argv) { log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); // if we will use the cache for the full prompt without reaching the end of the cache, force - // reevaluation of the last token token to recalculate the cached logits + // reevaluation of the last token to recalculate the cached logits if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); @@ -339,14 +339,14 @@ int main(int argc, char ** argv) { } // prefix & suffix for instruct mode - const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); - const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); // chatml prefix & suffix - const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true); + const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", true, true); const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); @@ -362,6 +362,9 @@ int main(int argc, char ** argv) { params.interactive_first = true; params.antiprompt.emplace_back("<|im_start|>user\n"); } + else if (params.conversation) { + params.interactive_first = true; + } // enable interactive mode if interactive start is specified if (params.interactive_first) { @@ -520,6 +523,10 @@ int main(int argc, char ** argv) { } struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + if (!ctx_sampling) { + fprintf(stderr, "%s: failed to initialize sampling subsystem\n", __func__); + exit(1); + } while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict @@ -544,7 +551,7 @@ int main(int argc, char ** argv) { // if we run out of context: // - take the n_keep first tokens from the original prompt (via n_past) // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches - if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (n_past + (int) embd.size() + std::max(0, guidance_offset) >= n_ctx) { if (params.n_predict == -2) { LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); break; @@ -693,7 +700,7 @@ int main(int argc, char ** argv) { // optionally save the session on first sample (for faster prompt loading next time) if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { need_to_save_session = false; - llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); LOG("saved session to %s\n", path_session.c_str()); } @@ -733,7 +740,7 @@ int main(int argc, char ** argv) { // display text if (input_echo && display) { for (auto id : embd) { - const std::string token_str = llama_token_to_piece(ctx, id); + const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation); printf("%s", token_str.c_str()); if (embd.size() > 1) { @@ -794,9 +801,9 @@ int main(int argc, char ** argv) { } } - // deal with end of text token in interactive mode - if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { - LOG("found EOS token\n"); + // deal with end of generation tokens in interactive mode + if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) { + LOG("found an EOG token\n"); if (params.interactive) { if (!params.antiprompt.empty()) { @@ -816,7 +823,7 @@ int main(int argc, char ** argv) { if (n_past > 0 && is_interacting) { LOG("waiting for user input\n"); - if (params.instruct || params.chatml) { + if (params.conversation || params.instruct || params.chatml) { printf("\n> "); } @@ -826,7 +833,7 @@ int main(int argc, char ** argv) { } std::string buffer; - if (!params.input_prefix.empty()) { + if (!params.input_prefix.empty() && !params.conversation) { LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); printf("%s", params.input_prefix.c_str()); } @@ -850,7 +857,7 @@ int main(int argc, char ** argv) { // Entering a empty line lets the user pass control back if (buffer.length() > 1) { // append input suffix if any - if (!params.input_suffix.empty()) { + if (!params.input_suffix.empty() && !params.conversation) { LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); printf("%s", params.input_suffix.c_str()); } @@ -876,7 +883,7 @@ int main(int argc, char ** argv) { } const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); - const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, params.interactive_specials); const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); @@ -919,8 +926,8 @@ int main(int argc, char ** argv) { } } - // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) { + // end of generation + if (!embd.empty() && llama_token_is_eog(model, embd.back()) && !(params.instruct || params.interactive || params.chatml)) { LOG_TEE(" [end of text]\n"); break; } @@ -935,7 +942,7 @@ int main(int argc, char ** argv) { if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); - llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + llama_state_save_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); } llama_print_timings(ctx); diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index a2ef0fb03..7c5595d6e 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -132,7 +132,6 @@ int main(int argc, char ** argv) { llama_context * ctx = NULL; // load the target model - params.logits_all = true; std::tie(model, ctx) = llama_init_from_gpt_params(params); // load the prompts from an external file if there are any @@ -360,7 +359,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (id == llama_token_eos(model) || + (llama_token_is_eog(model, id) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index 2cbc9e1fa..f2ef9ca10 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -252,8 +252,8 @@ int main(int argc, char ** argv) { // sample the most likely token const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - // is it an end of stream? - if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + // is it an end of generation? + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { LOG_TEE("\n"); break; diff --git a/examples/perplexity/README.md b/examples/perplexity/README.md index 50e1af011..c5e2bc5de 100644 --- a/examples/perplexity/README.md +++ b/examples/perplexity/README.md @@ -1,21 +1,132 @@ -# perplexity +# Perplexity -TODO +The `perplexity` example can be used to calculate the so-called perplexity value of a language model over a given text corpus. +Perplexity measures how well the model can predict the next token with lower values being better. +Note that perplexity is **not** directly comparable between models, especially if they use different tokenizers. +Also note that finetunes typically result in a higher perplexity value even though the human-rated quality of outputs increases. -## Llama 2 70B Scorechart -Quantization | Model size (GiB) | Perplexity | Delta to fp16 --- | -- | -- | -- -Q4_0 | 36.20 | 3.5550 | 3.61% -Q4_1 | 40.20 | 3.5125 | 2.37% -Q5_0 | 44.20 | 3.4744 | 1.26% -Q2_K | 27.27 | 3.7339 | 8.82% -Q3_K_S | 27.86 | 3.7019 | 7.89% -Q3_K_M | 30.83 | 3.5932 | 4.72% -Q3_K_L | 33.67 | 3.5617 | 3.80% -Q4_K_S | 36.39 | 3.4852 | 1.57% -Q4_K_M | 38.54 | 3.4725 | 1.20% -Q5_K_S | 44.20 | 3.4483 | 0.50% -Q5_K_M | 45.41 | 3.4451 | 0.40% -Q6_K | 52.70 | 3.4367 | 0.16% -fp16 | 128.5 | 3.4313 | - +Within llama.cpp the perplexity of base models is used primarily to judge the quality loss from e.g. quantized models vs. FP16. +The convention among contributors is to use the Wikitext-2 test set for testing unless noted otherwise (can be obtained with `scripts/get-wikitext-2.sh`). +By default only the mean perplexity value and the corresponding uncertainty is calculated. +The uncertainty is determined empirically by assuming a Gaussian distribution of the "correct" logits per and then applying error propagation. + +More statistics can be obtained by recording the logits from the FP16 version of a model. +To do this, supply `perplexity` with `--kl-divergence-base path/to/logit/binary/file.kld`. +The program will then record all logits and save them to the provided path in binary format. +**The logit file will be very large, 11 GiB for LLaMA 2 or 37 GiB for LLaMA 3 when using the Wikitext-2 test set.** +Once you have the file, supply `perplexity` with the quantized model, the logits file via `--kl-divergence-base`, +and finally the `--kl-divergence` argument to indicate that the program should calculate the so-called Kullback-Leibler divergence. +This is a measure of how similar the FP16 and the quantized logit distributions are with a value of 0 indicating that the distribution are the same. +The uncertainty on the mean KL divergence is calculated by assuming the KL divergence per token follows a Gaussian distribution. + +In addition to the KL divergence the following statistics are calculated with `--kl-divergence`: + +* Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same. +* Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. +* Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. +* Pearson correlation coefficient of the "correct" token probabilites between models. +* Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization. +* The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggerganov/llama.cpp/discussions/2875 . +* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution. + +## LLaMA 3 8b Scoreboard + +Results are sorted by Kullback-Leibler divergence relative to FP16. +The "WT" importance matrices were created using varying numbers of Wikitext tokens and can be found [here](https://huggingface.co/JohannesGaessler/llama.cpp_importance_matrices/blob/main/imatrix-llama_3-8b-f16-2.7m_tokens.dat). + +| Quantization | imatrix | Model size [GiB] | PPL | ΔPPL | KLD | Mean Δp | RMS Δp | +|--------------|---------|------------------|------------------------|------------------------|-----------------------|-------------------|------------------| +| f16 | None | 14.97 | 6.233160 ± 0.037828 | - | - | - | - | +| q8_0 | None | 7.96 | 6.234284 ± 0.037878 | 0.002650 ± 0.001006 | 0.001355 ± 0.000006 | -0.019 ± 0.003 % | 1.198 ± 0.007 % | +| q6_K | None | 6.14 | 6.253382 ± 0.038078 | 0.021748 ± 0.001852 | 0.005452 ± 0.000035 | -0.007 ± 0.006 % | 2.295 ± 0.019 % | +| q5_K_M | None | 5.33 | 6.288607 ± 0.038338 | 0.056974 ± 0.002598 | 0.010762 ± 0.000079 | -0.114 ± 0.008 % | 3.160 ± 0.031 % | +| q5_K_S | None | 5.21 | 6.336598 ± 0.038755 | 0.104964 ± 0.003331 | 0.016595 ± 0.000122 | -0.223 ± 0.010 % | 3.918 ± 0.036 % | +| q5_1 | None | 5.65 | 6.337857 ± 0.038677 | 0.106223 ± 0.003476 | 0.018045 ± 0.000139 | -0.287 ± 0.011 % | 4.123 ± 0.039 % | +| q5_0 | None | 5.21 | 6.363224 ± 0.038861 | 0.131591 ± 0.003894 | 0.022239 ± 0.000166 | -0.416 ± 0.012 % | 4.634 ± 0.043 % | +| q4_K_M | WT 10m | 4.58 | 6.382937 ± 0.039055 | 0.151303 ± 0.004429 | 0.028152 ± 0.000240 | -0.389 ± 0.014 % | 5.251 ± 0.049 % | +| q4_K_M | None | 4.58 | 6.407115 ± 0.039119 | 0.175482 ± 0.004620 | 0.031273 ± 0.000238 | -0.596 ± 0.014 % | 5.519 ± 0.050 % | +| q4_K_S | WT 10m | 4.37 | 6.409697 ± 0.039189 | 0.178064 ± 0.004744 | 0.031951 ± 0.000259 | -0.531 ± 0.015 % | 5.645 ± 0.051 % | +| iq4_NL | WT 10m | 4.35 | 6.455593 ± 0.039630 | 0.223959 ± 0.005201 | 0.035742 ± 0.000288 | -0.590 ± 0.016 % | 5.998 ± 0.054 % | +| iq4_XS | WT 10m | 4.14 | 6.459705 ± 0.039595 | 0.228071 ± 0.005207 | 0.036334 ± 0.000284 | -0.668 ± 0.016 % | 6.044 ± 0.054 % | +| q4_K_S | None | 4.37 | 6.500529 ± 0.039778 | 0.268895 ± 0.005638 | 0.043136 ± 0.000314 | -0.927 ± 0.017 % | 6.562 ± 0.055 % | +| q4_1 | None | 4.78 | 6.682737 ± 0.041285 | 0.451103 ± 0.008030 | 0.071683 ± 0.000505 | -0.927 ± 0.017 % | 8.512 ± 0.063 % | +| q4_0 | None | 4.34 | 6.700147 ± 0.041226 | 0.468514 ± 0.007951 | 0.071940 ± 0.000491 | -1.588 ± 0.022 % | 8.434 ± 0.061 % | +| q3_K_L | WT 10m | 4.03 | 6.671223 ± 0.041427 | 0.439590 ± 0.008154 | 0.073077 ± 0.000529 | -0.940 ± 0.023 % | 8.662 ± 0.064 % | +| q3_K_M | WT 10m | 3.74 | 6.734255 ± 0.041838 | 0.502622 ± 0.008901 | 0.084358 ± 0.000588 | -1.198 ± 0.024 % | 9.292 ± 0.065 % | +| q3_K_L | None | 4.03 | 6.787876 ± 0.042104 | 0.556242 ± 0.009171 | 0.087176 ± 0.000614 | -1.532 ± 0.025 % | 9.432 ± 0.067 % | +| q3_K_M | None | 3.74 | 6.888498 ± 0.042669 | 0.656864 ± 0.010071 | 0.101913 ± 0.000677 | -1.990 ± 0.026 % | 10.203 ± 0.068 % | +| iq3_M | WT 10m | 3.53 | 6.898327 ± 0.041643 | 0.666694 ± 0.009449 | 0.102534 ± 0.000663 | -3.178 ± 0.026 % | 10.513 ± 0.066 % | +| iq3_S | WT 10m | 3.42 | 6.965501 ± 0.042406 | 0.733867 ± 0.010245 | 0.111278 ± 0.000710 | -3.066 ± 0.027 % | 10.845 ± 0.068 % | +| iq3_XS | WT 10m | 3.28 | 7.163043 ± 0.043772 | 0.931409 ± 0.012084 | 0.138693 ± 0.000857 | -3.667 ± 0.031 % | 12.148 ± 0.070 % | +| iq3_XXS | WT 10m | 3.05 | 7.458436 ± 0.046404 | 1.226803 ± 0.015234 | 0.183625 ± 0.001042 | -3.918 ± 0.035 % | 13.836 ± 0.074 % | +| q3_K_S | WT 10m | 3.41 | 7.602878 ± 0.046848 | 1.371244 ± 0.015688 | 0.199821 ± 0.001008 | -5.046 ± 0.037 % | 14.980 ± 0.070 % | +| q3_K_S | None | 3.41 | 7.863786 ± 0.048885 | 1.632152 ± 0.017733 | 0.228217 ± 0.001079 | -5.604 ± 0.038 % | 15.541 ± 0.070 % | +| iq2_M | WT 10m | 2.74 | 8.600799 ± 0.055124 | 2.369166 ± 0.025244 | 0.325989 ± 0.00160 | -6.463 ± 0.046 % | 18.519 ± 0.080 % | +| q2_K | WT 10k | 2.96 | 8.652290 ± 0.055572 | 2.420657 ± 0.025587 | 0.331393 ± 0.001562 | -6.606 ± 0.046 % | 18.790 ± 0.078 % | +| q2_K | WT 100k | 2.96 | 8.641993 ± 0.055406 | 2.410359 ± 0.025495 | 0.331672 ± 0.001569 | -6.628 ± 0.047 % | 18.856 ± 0.078 % | +| q2_K | WT 10m | 2.96 | 8.647825 ± 0.055610 | 2.416191 ± 0.025683 | 0.332223 ± 0.001572 | -6.500 ± 0.047 % | 18.881 ± 0.078 % | +| q2_K | WT 1m | 2.96 | 8.674365 ± 0.055743 | 2.442732 ± 0.025843 | 0.335308 ± 0.001576 | -6.634 ± 0.047 % | 19.009 ± 0.079 % | +| q2_K | WT 1k | 2.96 | 8.682605 ± 0.055916 | 2.450972 ± 0.026069 | 0.337093 ± 0.001596 | -6.596 ± 0.047 % | 18.977 ± 0.079 % | +| q2_K_S | WT 10m | 2.96 | 9.323778 ± 0.061551 | 3.092145 ± 0.031914 | 0.403360 ± 0.001787 | -7.131 ± 0.049 % | 20.050 ± 0.081 % | +| q2_K_S | WT 1m | 2.96 | 9.329321 ± 0.061378 | 3.097688 ± 0.031816 | 0.403590 ± 0.001797 | -7.289 ± 0.049 % | 20.123 ± 0.081 % | +| q2_K_S | WT 100k | 2.96 | 9.362973 ± 0.061740 | 3.131339 ± 0.032169 | 0.408367 ± 0.001802 | -7.198 ± 0.050 % | 20.132 ± 0.081 % | +| q2_K_S | WT 10k | 2.96 | 9.376479 ± 0.062045 | 3.144846 ± 0.032464 | 0.408662 ± 0.001819 | -7.141 ± 0.050 % | 20.120 ± 0.081 % | +| q2_K_S | WT 1k | 2.96 | 9.415200 ± 0.062475 | 3.183567 ± 0.032993 | 0.415865 ± 0.001846 | -7.153 ± 0.050 % | 20.311 ± 0.082 % | +| iq2_S | WT 10m | 2.56 | 9.650781 ± 0.063209 | 3.419148 ± 0.034017 | 0.439197 ± 0.001976 | -8.319 ± 0.052 % | 21.491 ± 0.083 % | +| q2_K | None | 2.96 | 9.751568 ± 0.063312 | 3.519934 ± 0.033863 | 0.445132 ± 0.001835 | -9.123 ± 0.051 % | 21.421 ± 0.079 % | +| iq2_XS | WT 10m | 2.43 | 10.761424 ± 0.071056 | 4.529791 ± 0.042229 | 0.546290 ± 0.002133 | -10.576 ± 0.056 % | 23.872 ± 0.082 % | +| iq2_XXS | WT 10m | 2.24 | 14.091782 ± 0.098396 | 7.860148 ± 0.070752 | 0.812022 ± 0.002741 | -14.363 ± 0.065 % | 28.576 ± 0.084 % | +| iq1_M | WT 10m | 2.01 | 25.493722 ± 0.177903 | 19.262089 ± 0.152396 | 1.393084 ± 0.003529 | -24.672 ± 0.077 % | 38.287 ± 0.084 % | +| iq1_S | WT 1m | 1.88 | 58.097760 ± 0.438604 | 51.866126 ± 0.416604 | 2.211278 ± 0.004688 | -32.471 ± 0.087 % | 46.418 ± 0.085 % | +| iq1_S | WT 1k | 1.88 | 58.267851 ± 0.446208 | 52.036218 ± 0.424373 | 2.214858 ± 0.004778 | -31.880 ± 0.089 % | 46.330 ± 0.086 % | +| iq1_S | WT 100k | 1.88 | 58.581498 ± 0.453145 | 52.349864 ± 0.431360 | 2.220834 ± 0.004818 | -32.261 ± 0.089 % | 46.002 ± 0.086 % | +| iq1_S | WT 10m | 1.88 | 60.694593 ± 0.471290 | 54.462959 ± 0.449644 | 2.254554 ± 0.004868 | -31.973 ± 0.088 % | 46.271 ± 0.086 % | +| iq1_S | WT 10k | 1.88 | 63.221324 ± 0.493077 | 56.989691 ± 0.471423 | 2.293527 ± 0.004885 | -32.261 ± 0.089 % | 46.562 ± 0.086 % | + +There seems to be no consistent improvement from using more Wikitext tokens for the importance matrix. +K-quants score better on mean Δp than the legacy quants than e.g. KL divergence would suggest. + +## LLaMA 2 vs. LLaMA 3 Quantization comparison + +| Metric | L2 7b q2_K | L3 8b q2_K | L2 7b q4_K_M | L3 8b q4_K_M | L2 7b q6_K | L3 8b q6_K | L2 7b q8_0 | L3 8b q8_0 | +|-----------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------|---------------------| +| Mean PPL | 5.794552 ± 0.032298 | 9.751568 ± 0.063312 | 5.877078 ± 0.032781 | 6.407115 ± 0.039119 | 5.808494 ± 0.032425 | 6.253382 ± 0.038078 | 5.798542 ± 0.032366 | 6.234284 ± 0.037878 | +| Mean PPL ratio | 1.107955 ± 0.001427 | 1.564849 ± 0.004525 | 1.014242 ± 0.000432 | 1.028160 ± 0.000723 | 1.002406 ± 0.000191 | 1.003490 ± 0.000296 | 1.000689 ± 0.000107 | 1.000425 ± 0.000161 | +| Mean ΔPPL | 0.625552 ± 0.008725 | 3.519934 ± 0.033863 | 0.082526 ± 0.002530 | 0.175482 ± 0.004620 | 0.013941 ± 0.001110 | 0.021748 ± 0.001852 | 0.003990 ± 0.000624 | 0.002650 ± 0.001006 | +| PPL correlation | 97.36% | 89.62% | 99.71% | 99.34% | 99.94% | 99.88% | 99.98% | 99.96% | +| Mean KLD | 0.108903 ± 0.000645 | 0.445132 ± 0.001835 | 0.012686 ± 0.000079 | 0.031273 ± 0.000238 | 0.002098 ± 0.000014 | 0.005452 ± 0.000035 | 0.000369 ± 0.000007 | 0.001355 ± 0.000006 | +| Mean Δp | -2.710 ± 0.023 % | -9.123 ± 0.051 % | -0.416 ± 0.008 % | -0.596 ± 0.014 % | -0.035 ± 0.003 % | -0.007 ± 0.006 % | -0.005 ± 0.002 % | -0.019 ± 0.003 % | +| Maximum Δp | 85.136% | 94.268% | 45.209% | 95.054% | 23.593% | 53.601% | 43.925% | 28.734% | +| 99.9% Δp | 37.184% | 50.003% | 17.461% | 27.084% | 7.798% | 13.613% | 3.387% | 6.402% | +| 99.0% Δp | 18.131% | 25.875% | 7.798% | 12.084% | 3.838% | 6.407% | 1.867% | 3.544% | +| Median Δp | -0.391% | -2.476% | -0.026% | -0.024% | -0.001% | 0.000% | -0.000% | -0.000% | +| 1.0% Δp | -39.762% | -87.173% | -11.433% | -19.567% | -4.222% | -6.767% | -1.862% | -3.698% | +| 0.1% Δp | -79.002% | -98.897% | -26.433% | -56.054% | -9.091% | -16.584% | -3.252% | -6.579% | +| Minimum Δp | -99.915% | -99.965% | -83.383% | -98.699% | -43.142% | -68.487% | -9.343% | -24.301% | +| RMS Δp | 9.762 ± 0.053 % | 21.421 ± 0.079 % | 3.252 ± 0.024 % | 5.519 ± 0.050 % | 1.339 ± 0.010 % | 2.295 ± 0.019 % | 0.618 ± 0.011 % | 1.198 ± 0.007 % | +| Same top p | 85.584 ± 0.086 % | 71.138 ± 0.119 % | 94.665 ± 0.055 % | 91.901 ± 0.072 % | 97.520 ± 0.038 % | 96.031 ± 0.051 % | 98.846 ± 0.026 % | 97.674 ± 0.040 % | + + +## Old Numbers + +
+Llama 2 70B Scoreboard + +| Quantization | Model size (GiB) | Perplexity | Delta to fp16 | +|--------------|------------------|------------|---------------| +| Q4_0 | 36.20 | 3.5550 | 3.61% | +| Q4_1 | 40.20 | 3.5125 | 2.37% | +| Q5_0 | 44.20 | 3.4744 | 1.26% | +| Q2_K | 27.27 | 3.7339 | 8.82% | +| Q3_K_S | 27.86 | 3.7019 | 7.89% | +| Q3_K_M | 30.83 | 3.5932 | 4.72% | +| Q3_K_L | 33.67 | 3.5617 | 3.80% | +| Q4_K_S | 36.39 | 3.4852 | 1.57% | +| Q4_K_M | 38.54 | 3.4725 | 1.20% | +| Q5_K_S | 44.20 | 3.4483 | 0.50% | +| Q5_K_M | 45.41 | 3.4451 | 0.40% | +| Q6_K | 52.70 | 3.4367 | 0.16% | +| fp16 | 128.5 | 3.4313 | - | + +
diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index d766aef6a..db6e0949d 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -216,17 +216,22 @@ static void process_logits(std::ostream& out, int n_vocab, const float * logits, } struct kl_divergence_result { - double sum_nll = 0; - double sum_nll2 = 0; - double sum_kld = 0; - double sum_kld2 = 0; - double sum_nll_diff = 0; - double sum_nll_diff2 = 0; - size_t n_same_top = 0; - size_t count = 0; + double sum_nll = 0.0; + double sum_nll2 = 0.0; + double sum_nll_base = 0.0; + double sum_nll_base2 = 0.0; + double sum_nll_nll_base = 0.0; + double sum_kld = 0.0; + double sum_kld2 = 0.0; + double sum_p_diff = 0.0; + double sum_p_diff2 = 0.0; + double sum_p_diff4 = 0.0; + float max_p_diff = 0.0f; + size_t n_same_top = 0.0; + size_t count = 0.0; }; -static double log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { +static std::pair log_softmax(int n_vocab, const float * logits, const uint16_t * base_log_prob, int tok, kl_divergence_result & kld) { float max_logit = logits[0]; int imax = 0; for (int i = 1; i < n_vocab; ++i) { @@ -244,12 +249,17 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba const float scale = d[0]; const float min_log_prob = d[1]; base_log_prob += 4; - float nll = max_logit + log_sum_exp - logits[tok]; + + const float nll = max_logit + log_sum_exp - logits[tok]; kld.sum_nll += nll; kld.sum_nll2 += nll*nll; - nll += (scale*base_log_prob[tok] + min_log_prob); - kld.sum_nll_diff += nll; - kld.sum_nll_diff2 += nll*nll; + + const float nll_base = -(scale*base_log_prob[tok] + min_log_prob); + kld.sum_nll_base += nll_base; + kld.sum_nll_base2 += nll_base*nll_base; + + kld.sum_nll_nll_base += nll*nll_base; + max_logit += log_sum_exp; double sum = 0; int imax_base = -1; @@ -269,34 +279,50 @@ static double log_softmax(int n_vocab, const float * logits, const uint16_t * ba kld.sum_kld2 += sum*sum; ++kld.count; if (imax == imax_base) ++kld.n_same_top; - return sum; + + const float p_base = expf(-nll_base); + const float p = expf(-nll); + const float p_diff = p - p_base; + kld.sum_p_diff += p_diff; + const double p_diff2 = p_diff*p_diff; + kld.sum_p_diff2 += p_diff2; + kld.sum_p_diff4 += p_diff2*p_diff2; + kld.max_p_diff = std::max(kld.max_p_diff, std::fabs(p_diff)); + + return std::make_pair(sum, p_diff); } static void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, const std::vector & base_log_probs, kl_divergence_result & kld, - float * kld_values) { + float * kld_values, float * p_diff_values) { std::mutex mutex; const int nv = 2*((n_vocab + 1)/2) + 4; int counter = 0; - auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values] () { + auto compute = [&mutex, &counter, &base_log_probs, &kld, n_vocab, logits, tokens, n_token, nv, kld_values, p_diff_values] () { kl_divergence_result local_kld; while (true) { std::unique_lock lock(mutex); int i = counter++; if (i >= n_token) { - kld.sum_nll += local_kld.sum_nll; - kld.sum_nll2 += local_kld.sum_nll2; - kld.sum_kld += local_kld.sum_kld; - kld.sum_kld2 += local_kld.sum_kld2; - kld.sum_nll_diff += local_kld.sum_nll_diff; - kld.sum_nll_diff2 += local_kld.sum_nll_diff2; - kld.n_same_top += local_kld.n_same_top; - kld.count += local_kld.count; + kld.sum_nll += local_kld.sum_nll; + kld.sum_nll2 += local_kld.sum_nll2; + kld.sum_nll_base += local_kld.sum_nll_base; + kld.sum_nll_base2 += local_kld.sum_nll_base2; + kld.sum_nll_nll_base += local_kld.sum_nll_nll_base; + kld.sum_kld += local_kld.sum_kld; + kld.sum_kld2 += local_kld.sum_kld2; + kld.sum_p_diff += local_kld.sum_p_diff; + kld.sum_p_diff2 += local_kld.sum_p_diff2; + kld.sum_p_diff4 += local_kld.sum_p_diff4; + kld.n_same_top += local_kld.n_same_top; + kld.max_p_diff = std::max(kld.max_p_diff, local_kld.max_p_diff); + kld.count += local_kld.count; break; } lock.unlock(); - double v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); - kld_values[i] = (float)v; + std::pair v = log_softmax(n_vocab, logits + i*n_vocab, base_log_probs.data() + i*nv, tokens[i+1], local_kld); + kld_values[i] = (float)v.first; + p_diff_values[i] = v.second; } }; for (auto & w : workers) { @@ -315,10 +341,11 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // BOS tokens will be added for each chunk before eval const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); const int n_ctx = llama_n_ctx(ctx); @@ -380,6 +407,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & const int batch_size = std::min(end - batch_start, n_batch); //fprintf(stderr, " Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch); + // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { //fprintf(stderr, "%s : failed to eval\n", __func__); return {tokens, -1, logit_history, prob_history}; @@ -453,6 +481,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // BOS tokens will be added for each chunk before eval const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); std::ofstream logits_stream; if (!params.logits_file.empty()) { @@ -469,7 +498,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par auto tim1 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenizing the input ..\n", __func__); - std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + std::vector tokens = ::llama_tokenize(ctx, params.prompt, true); auto tim2 = std::chrono::high_resolution_clock::now(); fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); @@ -552,6 +581,8 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par const int batch_start = start + j * n_batch; const int batch_size = std::min(end - batch_start, n_batch); + int n_outputs = 0; + batch.n_tokens = 0; for (int seq = 0; seq < n_seq_batch; seq++) { int seq_start = batch_start + seq*n_ctx; @@ -566,11 +597,13 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par for (int k = 0; k < batch_size; ++k) { const int idx = seq*n_ctx + k; - batch.token[idx] = tokens[seq_start + k]; - batch.pos[idx] = j*n_batch + k; - batch.n_seq_id[idx] = 1; - batch.seq_id[idx][0] = seq; - batch.logits[idx] = batch.pos[idx] >= first ? 1 : 0; + batch.token [idx] = tokens[seq_start + k]; + batch.pos [idx] = j*n_batch + k; + batch.n_seq_id[idx] = 1; + batch.seq_id [idx][0] = seq; + batch.logits [idx] = batch.pos[idx] >= first ? 1 : 0; + + n_outputs += batch.logits[idx] != 0; } batch.n_tokens += batch_size; @@ -583,9 +616,9 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par return {tokens, -1, logit_history, prob_history}; } - if (num_batches > 1) { + if (num_batches > 1 && n_outputs > 0) { const auto * batch_logits = llama_get_logits(ctx); - logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + logits.insert(logits.end(), batch_logits, batch_logits + n_outputs * n_vocab); } } @@ -604,14 +637,15 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } for (int seq = 0; seq < n_seq_batch; seq++) { - const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx); + const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx + first); + llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first; if (!params.logits_file.empty()) { - process_logits(logits_stream, n_vocab, all_logits + first*n_vocab, + process_logits(logits_stream, n_vocab, all_logits, tokens_data, n_ctx - 1 - first, workers, log_probs, nll, nll2); } else { - process_logits(n_vocab, all_logits + first*n_vocab, + process_logits(n_vocab, all_logits, tokens_data, n_ctx - 1 - first, workers, nll, nll2, logit_history.data() + start + seq*n_ctx + first, @@ -652,6 +686,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par } static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector & batch_logits, int32_t n_batch, int32_t n_vocab) { + int prev_outputs = 0; for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += n_batch) { const int32_t n_tokens = std::min(n_batch, (int32_t) (batch.n_tokens - i)); @@ -672,7 +707,14 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector< return false; } - memcpy(batch_logits.data() + i*n_vocab, llama_get_logits(ctx), n_tokens*n_vocab*sizeof(float)); + int n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + n_outputs += batch_view.logits[i] != 0; + } + + memcpy(batch_logits.data() + prev_outputs*n_vocab, llama_get_logits(ctx), n_outputs*n_vocab*sizeof(float)); + + prev_outputs += n_outputs; } return true; @@ -757,9 +799,6 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const bool is_spm = llama_vocab_type(llama_get_model(ctx)) == LLAMA_VOCAB_TYPE_SPM; fprintf(stderr, "================================= is_spm = %d\n", is_spm); - // This is needed as usual for LLaMA models - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - // The tasks should be randomized so the score stabilizes quickly. bool randomize_tasks = true; @@ -779,7 +818,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { size_t ending_logprob_count[4]; double ending_logprob[4]; - size_t i_batch; // starting index in the llama_batch + size_t i_logits; // starting index of logits in the llama_batch size_t common_prefix; // max number of initial tokens that are the same in all sentences size_t required_tokens; // needed number of tokens to evaluate all 4 endings std::vector seq_tokens[4]; @@ -804,7 +843,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_cur.gold_ending_idx = std::stoi( prompt_lines[idx*6+1] ); for (size_t j = 0; j < 4; j++) { hs_cur.ending[j] = prompt_lines[idx*6+2+j]; - hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], add_bos); + hs_cur.seq_tokens[j] = ::llama_tokenize(ctx, hs_cur.context + " " + hs_cur.ending[j], true); } // determine the common prefix of the endings @@ -823,7 +862,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { hs_cur.seq_tokens[2].size() - hs_cur.common_prefix + hs_cur.seq_tokens[3].size() - hs_cur.common_prefix; - //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, add_bos).size()); + //GGML_ASSERT(hs_cur.common_prefix >= ::llama_tokenize(ctx, hs_cur.context, true).size()); // Delete the selected random example from the prompt if (randomize_tasks) { @@ -844,9 +883,10 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const int max_tasks_per_batch = 32; const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(n_ctx, 0, 4); std::vector tok_logits(n_vocab); + // TODO: this could be made smaller; it's currently the worst-case size std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; @@ -857,16 +897,17 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; // this tells us where in `llama_batch` we are currently + size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch llama_batch_clear(batch); // batch as much tasks as possible into the available context - // each task has 4 unique seuqnce ids - one for each ending + // each task has 4 unique sequence ids - one for each ending // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence while (n_cur + (int) hs_data[i1].required_tokens <= n_ctx) { auto & hs_cur = hs_data[i1]; + int n_logits = 0; const int s0 = 4*(i1 - i0); if (s0 + 4 > max_seq) { @@ -874,18 +915,23 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { } for (size_t i = 0; i < hs_cur.common_prefix; ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3}, false); + llama_batch_add(batch, hs_cur.seq_tokens[0][i], i, { s0 + 0, s0 + 1, s0 + 2, s0 + 3 }, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix + n_logits += 1; for (int s = 0; s < 4; ++s) { - for (size_t i = hs_cur.common_prefix; i < hs_cur.seq_tokens[s].size(); ++i) { - llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, true); + const size_t seq_tokens_size = hs_cur.seq_tokens[s].size(); + // TODO: don't evaluate the last token of each sequence + for (size_t i = hs_cur.common_prefix; i < seq_tokens_size; ++i) { + const bool needs_logits = i < seq_tokens_size - 1; + llama_batch_add(batch, hs_cur.seq_tokens[s][i], i, { s0 + s }, needs_logits); + n_logits += needs_logits; } } - hs_cur.i_batch = i_batch; - i_batch += hs_cur.required_tokens; + hs_cur.i_logits = i_logits; + i_logits += n_logits; n_cur += hs_data[i1].required_tokens; if (++i1 == hs_task_count) { @@ -911,12 +957,11 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { eval_pairs.clear(); for (size_t i = i0; i < i1; ++i) { auto & hs_cur = hs_data[i]; - size_t li = hs_cur.common_prefix; + size_t li = 1; // skip the last logit of the common prefix (computed separately below) for (int s = 0; s < 4; ++s) { for (size_t j = hs_cur.common_prefix; j < hs_cur.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(hs_cur.i_batch + li++, hs_cur.seq_tokens[s][j + 1]); + eval_pairs.emplace_back(hs_cur.i_logits + li++, hs_cur.seq_tokens[s][j + 1]); } - ++li; } } // Then we do the actual calculation @@ -928,7 +973,8 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { for (size_t i = i0; i < i1; ++i) { auto & hs_cur = hs_data[i]; - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(hs_cur.i_batch + hs_cur.common_prefix - 1), n_vocab*sizeof(float)); + // get the logits of the last token of the common prefix + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*hs_cur.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -978,7 +1024,7 @@ struct winogrande_entry { std::array choices; int answer; - size_t i_batch; + size_t i_logits; size_t common_prefix; size_t required_tokens; size_t n_base1; // number of tokens for context + choice 1 @@ -1089,12 +1135,9 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { fprintf(stderr, "%s : tokenizing selected tasks\n", __func__); - // This is needed as usual for LLaMA models - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - for (auto & task : data) { - task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, add_bos); - task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, add_bos); + task.seq_tokens[0] = ::llama_tokenize(ctx, task.first + task.choices[0] + task.second, true); + task.seq_tokens[1] = ::llama_tokenize(ctx, task.first + task.choices[1] + task.second, true); task.common_prefix = 0; for (size_t k = 0; k < task.seq_tokens[0].size(); k++) { @@ -1104,12 +1147,13 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { task.common_prefix++; } + // TODO: the last token of each of the sequences don't need to be evaluated task.required_tokens = task.common_prefix + task.seq_tokens[0].size() - task.common_prefix + task.seq_tokens[1].size() - task.common_prefix; - task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], add_bos).size(); - task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], add_bos).size(); + task.n_base1 = ::llama_tokenize(ctx, task.first + task.choices[0], true).size(); + task.n_base2 = ::llama_tokenize(ctx, task.first + task.choices[1], true).size(); } fprintf(stderr, "%s : calculating winogrande score over selected tasks.\n", __func__); @@ -1121,9 +1165,10 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int max_tasks_per_batch = 128; const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx)); - llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); + llama_batch batch = llama_batch_init(n_ctx, 0, 2); std::vector tok_logits(n_vocab); + // TODO: this could be made smaller; it's currently the worst-case size std::vector batch_logits(n_vocab*n_ctx); std::vector> eval_pairs; @@ -1137,29 +1182,33 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; + size_t i_logits = 0; llama_batch_clear(batch); while (n_cur + (int) data[i1].required_tokens <= n_ctx) { + int n_logits = 0; const int s0 = 2*(i1 - i0); if (s0 + 2 > max_seq) { break; } for (size_t i = 0; i < data[i1].common_prefix; ++i) { - llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1}, false); + llama_batch_add(batch, data[i1].seq_tokens[0][i], i, { s0 + 0, s0 + 1 }, false); } batch.logits[batch.n_tokens - 1] = true; + n_logits += 1; for (int s = 0; s < 2; ++s) { + // TODO: end before the last token, no need to predict past the end of the sequences for (size_t i = data[i1].common_prefix; i < data[i1].seq_tokens[s].size(); ++i) { llama_batch_add(batch, data[i1].seq_tokens[s][i], i, { s0 + s }, true); + n_logits += 1; } } - data[i1].i_batch = i_batch; - i_batch += data[i1].required_tokens; + data[i1].i_logits = i_logits; + i_logits += n_logits; n_cur += data[i1].required_tokens; if (++i1 == data.size()) { @@ -1190,15 +1239,16 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const auto& n_base1 = skip_choice ? task.n_base1 : task.common_prefix; const int last_1st = task.seq_tokens[0].size() - n_base1 > 1 ? 1 : 0; - size_t li = n_base1 - 1; + size_t li = n_base1 - task.common_prefix; for (size_t j = n_base1-1; j < task.seq_tokens[0].size()-1-last_1st; ++j) { - eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[0][j+1]); + eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[0][j+1]); } const auto& n_base2 = skip_choice ? task.n_base2 : task.common_prefix; const int last_2nd = task.seq_tokens[1].size() - n_base2 > 1 ? 1 : 0; - li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - 1; + // FIXME: this uses the wrong first logits when not skipping the choice word + li = task.seq_tokens[0].size() - task.common_prefix + n_base2 - task.common_prefix; for (size_t j = n_base2-1; j < task.seq_tokens[1].size()-1-last_2nd; ++j) { - eval_pairs.emplace_back(task.i_batch + li++, task.seq_tokens[1][j+1]); + eval_pairs.emplace_back(task.i_logits + li++, task.seq_tokens[1][j+1]); } } compute_logprobs(batch_logits.data(), n_vocab, workers, eval_pairs, eval_results); @@ -1287,14 +1337,14 @@ struct multiple_choice_task { } // For evaluation - size_t i_batch; // starting index in the llama_batch + size_t i_logits; // starting index of logits in the llama_batch size_t common_prefix; // max number of initial tokens that are the same in all sentences size_t required_tokens; // needed number of tokens to evaluate all answers std::vector> seq_tokens; std::vector log_probs; }; -static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, multiple_choice_task& task, bool log_error) { +static bool multiple_choice_prepare_one_task(llama_context * ctx, multiple_choice_task& task, bool log_error) { if (task.question.empty() || task.mc1.answers.empty()) { if (log_error) { printf("%s: found bad task with empty question and/or answers\n", __func__); @@ -1309,7 +1359,7 @@ static bool multiple_choice_prepare_one_task(llama_context * ctx, bool add_bos, } return false; } - task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, add_bos)); + task.seq_tokens.emplace_back(::llama_tokenize(ctx, task.question + " " + answer, true)); } auto min_len = task.seq_tokens.front().size(); for (auto& seq : task.seq_tokens) { @@ -1366,7 +1416,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params std::vector task_pos(n_task); strstream.read((char *)task_pos.data(), task_pos.size()*sizeof(uint32_t)); if (strstream.fail()) { - printf("%s: failed to raad task positions from prompt\n", __func__); + printf("%s: failed to read task positions from prompt\n", __func__); return; } @@ -1408,9 +1458,6 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params n_task = params.multiple_choice_tasks; } - // This is needed as usual for LLaMA models - const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); - printf("%s: preparing task data", __func__); fflush(stdout); if (n_task > 500) { @@ -1418,7 +1465,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params fflush(stdout); std::atomic counter(0); std::atomic n_bad(0); - auto prepare = [&counter, &n_bad, &tasks, ctx, add_bos] () { + auto prepare = [&counter, &n_bad, &tasks, ctx] () { int num_tasks = tasks.size(); int n_bad_local = 0; while (true) { @@ -1429,7 +1476,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params } int last = std::min(first + K_TOKEN_CHUNK, num_tasks); for (int i = first; i < last; ++i) { - if (!multiple_choice_prepare_one_task(ctx, add_bos, tasks[i], false)) ++n_bad_local; + if (!multiple_choice_prepare_one_task(ctx, tasks[i], false)) ++n_bad_local; } } }; @@ -1447,11 +1494,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params return; } } else { - int n_dot = n_task/100; + int n_dot = std::max((int) n_task/100, 1); int i_task = 0; for (auto& task : tasks) { ++i_task; - if (!multiple_choice_prepare_one_task(ctx, add_bos, task, true)) { + if (!multiple_choice_prepare_one_task(ctx, task, true)) { return; } if (i_task%n_dot == 0) { @@ -1491,17 +1538,18 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params int n_cur = 0; size_t i1 = i0; - size_t i_batch = 0; // this tells us where in `llama_batch` we are currently + size_t i_logits = 0; // this tells us how many logits were needed before this point in the batch llama_batch_clear(batch); // batch as much tasks as possible into the available context - // each task has 4 unique seuqnce ids - one for each ending + // each task has 4 unique sequence ids - one for each ending // the common prefix is shared among the 4 sequences to save tokens // we extract logits only from the last common token and from all ending tokens of each sequence int s0 = 0; while (n_cur + (int) tasks[i1].required_tokens <= n_ctx) { auto& cur_task = tasks[i1]; + int n_logits = 0; int num_answers = cur_task.seq_tokens.size(); if (s0 + num_answers > max_seq) { @@ -1518,17 +1566,22 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params llama_batch_add(batch, cur_task.seq_tokens[0][i], i, batch_indeces, false); } batch.logits[batch.n_tokens - 1] = true; // we need logits for the last token of the common prefix + n_logits += 1; for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { - for (size_t i = cur_task.common_prefix; i < cur_task.seq_tokens[s].size(); ++i) { - llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, true); + const size_t seq_tokens_size = cur_task.seq_tokens[s].size(); + // TODO: don't evaluate the last token of each sequence + for (size_t i = cur_task.common_prefix; i < seq_tokens_size; ++i) { + const bool needs_logits = i < seq_tokens_size - 1; + llama_batch_add(batch, cur_task.seq_tokens[s][i], i, { s0 + s }, needs_logits); + n_logits += needs_logits; } } s0 += num_answers; - cur_task.i_batch = i_batch; - i_batch += cur_task.required_tokens; + cur_task.i_logits = i_logits; + i_logits += n_logits; n_cur += cur_task.required_tokens; if (++i1 == tasks.size()) { @@ -1554,12 +1607,11 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params eval_pairs.clear(); for (size_t i = i0; i < i1; ++i) { auto& cur_task = tasks[i]; - size_t li = cur_task.common_prefix; + size_t li = 1; // skip the last logit of the common prefix (computed separately below) for (int s = 0; s < int(cur_task.seq_tokens.size()); ++s) { for (size_t j = cur_task.common_prefix; j < cur_task.seq_tokens[s].size() - 1; j++) { - eval_pairs.emplace_back(cur_task.i_batch + li++, cur_task.seq_tokens[s][j + 1]); + eval_pairs.emplace_back(cur_task.i_logits + li++, cur_task.seq_tokens[s][j + 1]); } - ++li; } } // Then we do the actual calculation @@ -1578,7 +1630,8 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params //} //printf("\n common_prefix: %zu\n", cur_task.common_prefix); - std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*(cur_task.i_batch + cur_task.common_prefix - 1), n_vocab*sizeof(float)); + // get the logits of the last token of the common prefix + std::memcpy(tok_logits.data(), batch_logits.data() + n_vocab*cur_task.i_logits, n_vocab*sizeof(float)); const auto first_probs = softmax(tok_logits); @@ -1681,9 +1734,11 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { const int num_batches = (n_ctx + n_batch - 1)/n_batch; const int nv = 2*((n_vocab + 1)/2) + 4; const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + GGML_ASSERT(llama_add_eos_token(llama_get_model(ctx)) != 1); std::vector log_probs_uint16(size_t(n_ctx - 1 - n_ctx/2) * nv); - std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); + std::vector kld_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); + std::vector p_diff_values(size_t(n_ctx - 1 - n_ctx/2)*n_chunk); std::vector logits; if (num_batches > 1) { logits.reserve(n_ctx * n_vocab); @@ -1700,9 +1755,18 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { df = df > 0 && count > 10 ? sqrt(df/(count-1)) : 0.; return std::make_pair(f, df); }; + auto covariance = [] (double suma, double sumb, double sumab, size_t count) { + if (count < 10) { + return 0.0; + } + double var = sumab/count - (suma/count)*(sumb/count); + var /= count - 1; + return var; + }; kl_divergence_result kld; - auto kld_ptr = kld_values.data(); + auto kld_ptr = kld_values.data(); + auto p_diff_ptr = p_diff_values.data(); for (int i = 0; i < n_chunk; ++i) { const int start = i * n_ctx; @@ -1730,6 +1794,7 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } + // TODO: use llama_batch.logits instead of relying on logits_all == true if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { fprintf(stderr, "%s : failed to eval\n", __func__); return; @@ -1756,24 +1821,42 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { } fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); - printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL-Divergence Same top\n"); + printf("\nchunk PPL ln(PPL(Q)/PPL(base)) KL Divergence Δp RMS Same top p\n"); } const int first = n_ctx/2; const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits(ctx); process_logits(n_vocab, all_logits + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, - workers, log_probs_uint16, kld, kld_ptr); - kld_ptr += n_ctx - 1 - first; + workers, log_probs_uint16, kld, kld_ptr, p_diff_ptr); + p_diff_ptr += n_ctx - 1 - first; + kld_ptr += n_ctx - 1 - first; - auto ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); - auto log_ppl_ratio = mean_and_uncertainty(kld.sum_nll_diff, kld.sum_nll_diff2, kld.count); - auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - auto p_top = 1.*kld.n_same_top/kld.count; - auto d_p_top = sqrt(p_top*(1 - p_top)/(kld.count - 1)); + printf("%4d", i+1); - printf("%4d %10.4lf %10.5lf ± %10.5f %10.5f ± %10.5lf %.5f ± %.5f\n", i+1, exp(ppl.first), - log_ppl_ratio.first, log_ppl_ratio.second, kl_div.first, kl_div.second, - p_top, d_p_top); + auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); + const double ppl_val = exp(log_ppl.first); + const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) + printf(" %9.4lf ± %9.4lf", ppl_val, ppl_unc); + + auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); + const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); + const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; + const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); + printf(" %10.5lf ± %10.5lf", log_ppl_ratio_val, log_ppl_ratio_unc); + + auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); + printf(" %10.5lf ± %10.5lf", kl_div.first, kl_div.second); + + auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); + const double p_diff_rms_val = sqrt(p_diff_mse.first); + const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; + printf(" %6.3lf ± %6.3lf %%", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + + double p_top_val = 1.*kld.n_same_top/kld.count; + double p_top_unc = sqrt(p_top_val*(1 - p_top_val)/(kld.count - 1)); + printf(" %6.3lf ± %6.3lf %%", 100.0*p_top_val, 100.0*p_top_unc); + + printf("\n"); fflush(stdout); @@ -1784,31 +1867,97 @@ static void kl_divergence(llama_context * ctx, const gpt_params & params) { if (kld.count < 100) return; // we do not wish to do statistics on so few values std::sort(kld_values.begin(), kld_values.end()); + std::sort(p_diff_values.begin(), p_diff_values.end()); - printf("===== KL-divergence statistics\n"); + printf("====== Perplexity statistics ======\n"); + + auto log_ppl = mean_and_uncertainty(kld.sum_nll, kld.sum_nll2, kld.count); + const double ppl_val = exp(log_ppl.first); + const double ppl_unc = ppl_val * log_ppl.second; // ppl_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl.second ** 2 ) + printf("Mean PPL(Q) : %10.6lf ± %10.6lf\n", ppl_val, ppl_unc); + + auto log_ppl_base = mean_and_uncertainty(kld.sum_nll_base, kld.sum_nll_base2, kld.count); + const double ppl_base_val = exp(log_ppl_base.first); + const double ppl_base_unc = ppl_base_val * log_ppl_base.second; // ppl_base_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_base.second ** 2 ) + printf("Mean PPL(base) : %10.6lf ± %10.6lf\n", ppl_base_val, ppl_base_unc); + + const double log_ppl_cov = covariance(kld.sum_nll, kld.sum_nll_base, kld.sum_nll_nll_base, kld.count); + // printf("Cov(ln(PPL(Q)), ln(PPL(base))): %10.6lf\n", log_ppl_cov); + const double log_ppl_cor = log_ppl_cov / (log_ppl.second*log_ppl_base.second); + printf("Cor(ln(PPL(Q)), ln(PPL(base))): %6.2lf%%\n", 100.0*log_ppl_cor); + + const double log_ppl_ratio_val = log_ppl.first - log_ppl_base.first; + const double log_ppl_ratio_unc = sqrt(log_ppl.second*log_ppl.second + log_ppl_base.second*log_ppl_base.second - 2.0*log_ppl_cov); + printf("Mean ln(PPL(Q)/PPL(base)) : %10.6lf ± %10.6lf\n", log_ppl_ratio_val, log_ppl_ratio_unc); + + const double ppl_ratio_val = exp(log_ppl_ratio_val); + const double ppl_ratio_unc = ppl_ratio_val * log_ppl_ratio_unc; // ppl_ratio_unc = sqrt( (dexp(x) / dx) ** 2 * log_ppl_ratio.second ** 2 ) + printf("Mean PPL(Q)/PPL(base) : %10.6lf ± %10.6lf\n", ppl_ratio_val, ppl_ratio_unc); + + const double ppl_cov = ppl_val * ppl_base_val * log_ppl_cov; + const double ppl_diff_val = ppl_val - ppl_base_val; + const double ppl_diff_unc = sqrt(ppl_unc*ppl_unc + ppl_base_unc*ppl_base_unc - 2.0*ppl_cov); + printf("Mean PPL(Q)-PPL(base) : %10.6lf ± %10.6lf\n", ppl_diff_val, ppl_diff_unc); + + printf("\n"); + + printf("====== KL divergence statistics ======\n"); auto kl_div = mean_and_uncertainty(kld.sum_kld, kld.sum_kld2, kld.count); - printf("Average: %10.6f ±%10.6lf\n", kl_div.first, kl_div.second); + printf("Mean KLD: %10.6lf ± %10.6lf\n", kl_div.first, kl_div.second); auto kld_median = kld_values.size()%2 == 0 ? 0.5f*(kld_values[kld_values.size()/2] + kld_values[kld_values.size()/2-1]) : kld_values[kld_values.size()/2]; - printf("Median : %10.6f\n", kld_median); - auto percentile = [&kld_values] (float fraction) { - if (fraction <= 0) return kld_values.front(); - if (fraction >= 1) return kld_values.back(); - float p = fraction*(kld_values.size() - 1); + auto percentile = [] (std::vector values, float fraction) { + if (fraction <= 0) return values.front(); + if (fraction >= 1) return values.back(); + float p = fraction*(values.size() - 1); size_t ip = size_t(p); p -= ip; - return (1 - p)*kld_values[ip] + p*kld_values[std::min(ip+1, kld_values.size()-1)]; + return (1 - p)*values[ip] + p*values[std::min(ip+1, values.size()-1)]; }; - printf("Maximum: %10.6f\n", kld_values.back()); - printf("KLD_99 : %10.6f\n", percentile(0.99f)); - printf("KLD_95 : %10.6f\n", percentile(0.95f)); - printf("KLD_90 : %10.6f\n", percentile(0.90f)); + printf("Maximum KLD: %10.6f\n", kld_values.back()); + printf("99.9%% KLD: %10.6f\n", percentile(kld_values, 0.999f)); + printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + printf("99.0%% KLD: %10.6f\n", percentile(kld_values, 0.990f)); + printf("Median KLD: %10.6f\n", kld_median); + printf("10.0%% KLD: %10.6f\n", percentile(kld_values, 0.100f)); + printf(" 5.0%% KLD: %10.6f\n", percentile(kld_values, 0.050f)); + printf(" 1.0%% KLD: %10.6f\n", percentile(kld_values, 0.010f)); + printf("Minimum KLD: %10.6f\n", kld_values.front()); - printf("Minimum: %10.6f\n", kld_values.front()); - printf("KLD_01 : %10.6f\n", percentile(0.01f)); - printf("KLD_05 : %10.6f\n", percentile(0.05f)); - printf("KLD_10 : %10.6f\n", percentile(0.10f)); + printf("\n"); + + printf("====== Token probability statistics ======\n"); + + auto p_diff = mean_and_uncertainty(kld.sum_p_diff, kld.sum_p_diff2, kld.count); + printf("Mean Δp: %6.3lf ± %5.3lf %%\n", 100.0*p_diff.first, 100.0*p_diff.second); + + auto p_diff_median = p_diff_values.size()%2 == 0 ? 0.5f*(p_diff_values[p_diff_values.size()/2] + p_diff_values[p_diff_values.size()/2-1]) + : p_diff_values[p_diff_values.size()/2]; + + printf("Maximum Δp: %6.3lf%%\n", 100.0*p_diff_values.back()); + printf("99.9%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.999f)); + printf("99.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.990f)); + printf("95.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.950f)); + printf("90.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.900f)); + printf("75.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.750f)); + printf("Median Δp: %6.3lf%%\n", 100.0*p_diff_median); + printf("25.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.250f)); + printf("10.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.100f)); + printf(" 5.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.050f)); + printf(" 1.0%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.010f)); + printf(" 0.1%% Δp: %6.3lf%%\n", 100.0*percentile(p_diff_values, 0.001f)); + printf("Minimum Δp: %6.3lf%%\n", 100.0*p_diff_values.front()); + + auto p_diff_mse = mean_and_uncertainty(kld.sum_p_diff2, kld.sum_p_diff4, kld.count); + // printf("MSE Δp : %10.6lf ± %10.6lf\n", p_diff_mse.first, p_diff_mse.second); + + const double p_diff_rms_val = sqrt(p_diff_mse.first); + const double p_diff_rms_unc = 0.5/p_diff_rms_val * p_diff_mse.second; + printf("RMS Δp : %6.3lf ± %5.3lf %%\n", 100.0*p_diff_rms_val, 100.0*p_diff_rms_unc); + + const double same_top_p = 1.0*kld.n_same_top/kld.count; + printf("Same top p: %6.3lf ± %5.3lf %%\n", 100.0*same_top_p, 100.0*sqrt(same_top_p*(1.0 - same_top_p)/(kld.count - 1))); } @@ -1823,12 +1972,20 @@ int main(int argc, char ** argv) { const int32_t n_ctx = params.n_ctx; + if (n_ctx <= 0) { + fprintf(stderr, "%s: perplexity tool requires '--ctx-size' > 0\n", __func__); + return 1; + } + const bool ppl = !params.hellaswag && !params.winogrande && !params.multiple_choice && !params.kl_divergence; + if (ppl) { - int n_seq = std::max(1, params.n_batch / n_ctx); - int32_t n_kv = n_seq * n_ctx; + const int32_t n_seq = std::max(1, params.n_batch / n_ctx); + const int32_t n_kv = n_seq * n_ctx; + params.n_parallel = n_seq; - params.n_ctx = n_kv; + params.n_ctx = n_kv; + params.n_batch = std::min(params.n_batch, n_kv); } else { params.n_batch = std::min(params.n_batch, params.n_ctx); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 1d05f1391..746df8446 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -23,7 +23,7 @@ #endif struct quantize_stats_params { - std::string model = "models/7B/ggml-model-f16.gguf"; + std::string model = DEFAULT_MODEL_PATH; bool verbose = false; bool per_layer_stats = false; bool print_histogram = false; diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index 6f374a2bd..6b977fde8 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,6 +1,6 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(${TARGET} PRIVATE ../../common) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/README.md b/examples/quantize/README.md index c8b9a27a0..8a10365c0 100644 --- a/examples/quantize/README.md +++ b/examples/quantize/README.md @@ -4,17 +4,17 @@ TODO ## Llama 2 7B -Quantization | Bits per Weight (BPW) --- | -- -Q2_K | 3.35 -Q3_K_S | 3.50 -Q3_K_M | 3.91 -Q3_K_L | 4.27 -Q4_K_S | 4.58 -Q4_K_M | 4.84 -Q5_K_S | 5.52 -Q5_K_M | 5.68 -Q6_K | 6.56 +| Quantization | Bits per Weight (BPW) | +|--------------|-----------------------| +| Q2_K | 3.35 | +| Q3_K_S | 3.50 | +| Q3_K_M | 3.91 | +| Q3_K_L | 4.27 | +| Q4_K_S | 4.58 | +| Q4_K_M | 4.84 | +| Q5_K_S | 5.52 | +| Q5_K_M | 5.68 | +| Q6_K | 6.56 | ## Llama 2 13B Quantization | Bits per Weight (BPW) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 7662ec80c..909eab283 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -8,7 +8,6 @@ #include #include #include -#include struct quant_option { std::string name; @@ -26,6 +25,7 @@ static const std::vector QUANT_OPTIONS = { { "IQ2_S", LLAMA_FTYPE_MOSTLY_IQ2_S, " 2.5 bpw quantization", }, { "IQ2_M", LLAMA_FTYPE_MOSTLY_IQ2_M, " 2.7 bpw quantization", }, { "IQ1_S", LLAMA_FTYPE_MOSTLY_IQ1_S, " 1.56 bpw quantization", }, + { "IQ1_M", LLAMA_FTYPE_MOSTLY_IQ1_M, " 1.75 bpw quantization", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "IQ3_XXS",LLAMA_FTYPE_MOSTLY_IQ3_XXS," 3.06 bpw quantization", }, @@ -46,12 +46,17 @@ static const std::vector QUANT_OPTIONS = { { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", }, { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", }, { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", }, - { "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", }, + { "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", }, + { "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", }, { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", }, // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching. { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", }, }; +static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE = "quantize.imatrix.file"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET = "quantize.imatrix.dataset"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES = "quantize.imatrix.entries_count"; +static const char * const LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS = "quantize.imatrix.chunks_count"; static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) { std::string ftype_str; @@ -87,13 +92,18 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp // [[noreturn]] static void usage(const char * executable) { - printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); + printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable); printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n"); printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n"); printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n"); printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n"); printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n"); printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n"); + printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n"); + printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n"); + printf(" --keep-split: will generate quatized model in the same shards as input"); + printf(" --override-kv KEY=TYPE:VALUE\n"); + printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n"); printf("Note: --include-weights and --exclude-weights cannot be used together\n"); printf("\nAllowed quantization types:\n"); for (auto & it : QUANT_OPTIONS) { @@ -107,61 +117,80 @@ static void usage(const char * executable) { exit(1); } -static void load_imatrix(const std::string& imatrix_file, std::unordered_map>& imatrix_data) { +static int load_imatrix(const std::string & imatrix_file, std::string & imatrix_dataset, std::unordered_map> & imatrix_data) { std::ifstream in(imatrix_file.c_str(), std::ios::binary); if (!in) { - printf("%s: failed to open %s\n",__func__,imatrix_file.c_str()); - return; + printf("%s: failed to open %s\n",__func__, imatrix_file.c_str()); + exit(1); } int n_entries; - in.read((char*)&n_entries, sizeof(n_entries)); + in.read((char *)&n_entries, sizeof(n_entries)); if (in.fail() || n_entries < 1) { printf("%s: no data in file %s\n", __func__, imatrix_file.c_str()); - return; + exit(1); } for (int i = 0; i < n_entries; ++i) { int len; in.read((char *)&len, sizeof(len)); std::vector name_as_vec(len+1); in.read((char *)name_as_vec.data(), len); if (in.fail()) { - printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str()); - return; + printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str()); + exit(1); } name_as_vec[len] = 0; std::string name{name_as_vec.data()}; - auto& e = imatrix_data[std::move(name)]; + auto & e = imatrix_data[name]; int ncall; - in.read((char*)&ncall, sizeof(ncall)); + in.read((char *)&ncall, sizeof(ncall)); int nval; in.read((char *)&nval, sizeof(nval)); if (in.fail() || nval < 1) { - printf("%s: failed reading number of values for entry %d\n",__func__,i); + printf("%s: failed reading number of values for entry %d\n", __func__, i); imatrix_data = {}; - return; + exit(1); } e.resize(nval); - in.read((char*)e.data(), nval*sizeof(float)); + in.read((char *)e.data(), nval*sizeof(float)); if (in.fail()) { - printf("%s: failed reading data for entry %d\n",__func__,i); + printf("%s: failed reading data for entry %d\n", __func__, i); imatrix_data = {}; - return; + exit(1); } if (ncall > 0) { for (auto& v : e) v /= ncall; } + + if (getenv("LLAMA_TRACE")) { + printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str()); + } } - printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str()); + + // latest imatrix version contains the dataset filename at the end of the file + int m_last_call = 0; + if (in.peek() != EOF) { + in.read((char *)&m_last_call, sizeof(m_last_call)); + int dataset_len; + in.read((char *)&dataset_len, sizeof(dataset_len)); + std::vector dataset_as_vec(dataset_len); + in.read(dataset_as_vec.data(), dataset_len); + imatrix_dataset.assign(dataset_as_vec.begin(), dataset_as_vec.end()); + printf("%s: imatrix dataset='%s'\n", __func__, imatrix_dataset.c_str()); + } + printf("%s: loaded %d importance matrix entries from %s computed on %d chunks\n", __func__, int(imatrix_data.size()), imatrix_file.c_str(), m_last_call); + return m_last_call; } -static void prepare_imatrix(const std::string& imatrix_file, - const std::vector& included_weights, - const std::vector& excluded_weights, - std::unordered_map>& imatrix_data) { +static int prepare_imatrix(const std::string & imatrix_file, + std::string & imatrix_dataset, + const std::vector & included_weights, + const std::vector & excluded_weights, + std::unordered_map> & imatrix_data) { + int m_last_call = -1; if (!imatrix_file.empty()) { - load_imatrix(imatrix_file, imatrix_data); + m_last_call = load_imatrix(imatrix_file, imatrix_dataset, imatrix_data); } if (imatrix_data.empty()) { - return; + return m_last_call; } if (!excluded_weights.empty()) { for (auto& name : excluded_weights) { @@ -187,6 +216,19 @@ static void prepare_imatrix(const std::string& imatrix_file, if (!imatrix_data.empty()) { printf("%s: have %d importance matrix entries\n", __func__, int(imatrix_data.size())); } + return m_last_call; +} + +static ggml_type parse_ggml_type(const char * arg) { + ggml_type result = GGML_TYPE_COUNT; + for (int j = 0; j < GGML_TYPE_COUNT; ++j) { + auto type = ggml_type(j); + const auto * name = ggml_type_name(type); + if (name && strcmp(arg, name) == 0) { + result = type; break; + } + } + return result; } int main(int argc, char ** argv) { @@ -199,10 +241,27 @@ int main(int argc, char ** argv) { int arg_idx = 1; std::string imatrix_file; std::vector included_weights, excluded_weights; + std::vector kv_overrides; for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) { params.quantize_output_tensor = false; + } else if (strcmp(argv[arg_idx], "--output-tensor-type") == 0) { + if (arg_idx < argc-1) { + params.output_tensor_type = parse_ggml_type(argv[++arg_idx]); + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--token-embedding-type") == 0) { + if (arg_idx < argc-1) { + params.token_embedding_type = parse_ggml_type(argv[++arg_idx]); + } else { + usage(argv[0]); + } + } else if (strcmp(argv[arg_idx], "--override-kv") == 0) { + if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) { + usage(argv[0]); + } } else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) { params.allow_requantize = true; } else if (strcmp(argv[arg_idx], "--pure") == 0) { @@ -225,6 +284,8 @@ int main(int argc, char ** argv) { } else { usage(argv[0]); } + } else if (strcmp(argv[arg_idx], "--keep-split")) { + params.keep_split = true; } else { usage(argv[0]); } @@ -238,10 +299,48 @@ int main(int argc, char ** argv) { usage(argv[0]); } + std::string imatrix_dataset; std::unordered_map> imatrix_data; - prepare_imatrix(imatrix_file, included_weights, excluded_weights, imatrix_data); + int m_last_call = prepare_imatrix(imatrix_file, imatrix_dataset, included_weights, excluded_weights, imatrix_data); if (!imatrix_data.empty()) { params.imatrix = &imatrix_data; + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_file.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + if (!imatrix_dataset.empty()) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_DATASET); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_STR; + strncpy(kvo.val_str, imatrix_dataset.c_str(), 127); + kvo.val_str[127] = '\0'; + kv_overrides.emplace_back(std::move(kvo)); + } + + { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = imatrix_data.size(); + kv_overrides.emplace_back(std::move(kvo)); + } + + if (m_last_call > 0) { + llama_model_kv_override kvo; + std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS); + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.val_i64 = m_last_call; + kv_overrides.emplace_back(std::move(kvo)); + } + } + if (!kv_overrides.empty()) { + kv_overrides.emplace_back(); + kv_overrides.back().key[0] = 0; + params.kv_overrides = &kv_overrides; } llama_backend_init(); @@ -252,21 +351,28 @@ int main(int argc, char ** argv) { std::string fname_out; std::string ftype_str; + std::string suffix = ".gguf"; if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) { std::string fpath; const size_t pos = fname_inp.find_last_of("/\\"); if (pos != std::string::npos) { fpath = fname_inp.substr(0, pos + 1); } - // export as [inp path]/ggml-model-[ftype].gguf - fname_out = fpath + "ggml-model-" + ftype_str + ".gguf"; + + // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting + fname_out = fpath + "ggml-model-" + ftype_str; + if (!params.keep_split) { + fname_out += suffix; + } arg_idx++; if (ftype_str == "COPY") { params.only_copy = true; } - } - else { + } else { fname_out = argv[arg_idx]; + if (params.keep_split && fname_out.find(suffix) != std::string::npos) { + fname_out = fname_out.substr(0, fname_out.length() - suffix.length()); + } arg_idx++; if (argc <= arg_idx) { @@ -296,10 +402,12 @@ int main(int argc, char ** argv) { if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || - params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) && imatrix_data.empty()) { - fprintf(stderr, "\n===============================================================================================\n"); - fprintf(stderr, "Please do not use IQ1_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); - fprintf(stderr, "===============================================================================================\n\n\n"); + params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || + params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) { + fprintf(stderr, "\n==========================================================================================================\n"); + fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n"); + fprintf(stderr, "==========================================================================================================\n\n\n"); return 1; } diff --git a/examples/quantize/tests.sh b/examples/quantize/tests.sh new file mode 100644 index 000000000..160c12bee --- /dev/null +++ b/examples/quantize/tests.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -eu + +if [ $# -lt 1 ] +then + echo "usage: $0 path_to_build_binary [path_to_temp_folder]" + echo "example: $0 ../../build/bin ../../tmp" + exit 1 +fi + +if [ $# -gt 1 ] +then + TMP_DIR=$2 +else + TMP_DIR=/tmp +fi + +set -x + +SPLIT=$1/gguf-split +QUANTIZE=$1/quantize +MAIN=$1/main +WORK_PATH=$TMP_DIR/quantize +ROOT_DIR=$(realpath $(dirname $0)/../../) + +mkdir -p "$WORK_PATH" + +# Clean up in case of previously failed test +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf + +# 1. Get a model +( +cd $WORK_PATH +"$ROOT_DIR"/scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf +) +echo PASS + +# 2. Split model +$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split +echo PASS +echo + +# 3. Requant model with '--keep_split' +$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K +echo PASS +echo + +# 3a. Test the requanted model is loading properly +$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# 4. Requant mode without '--keep_split' +$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K +echo PASS +echo + +# 4b. Test the requanted model is loading properly +$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32 +echo PASS +echo + +# Clean up +rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf diff --git a/examples/regex-to-grammar.py b/examples/regex-to-grammar.py index c98118e95..5cd9210a4 100644 --- a/examples/regex-to-grammar.py +++ b/examples/regex-to-grammar.py @@ -8,7 +8,7 @@ print(subprocess.check_output( "python", os.path.join( os.path.dirname(os.path.realpath(__file__)), - "json-schema-to-grammar.py"), + "json_schema_to_grammar.py"), *rest, "-", "--raw-pattern", diff --git a/examples/retrieval/CMakeLists.txt b/examples/retrieval/CMakeLists.txt new file mode 100644 index 000000000..eaabae08d --- /dev/null +++ b/examples/retrieval/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET retrieval) +add_executable(${TARGET} retrieval.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/retrieval/README.md b/examples/retrieval/README.md new file mode 100644 index 000000000..2b2595c46 --- /dev/null +++ b/examples/retrieval/README.md @@ -0,0 +1,69 @@ +# llama.cpp/examples/retrieval + +Demonstration of simple retrieval technique based on cosine similarity + +More info: +https://github.com/ggerganov/llama.cpp/pull/6193 + +### How to use + +`retieval.cpp` has parameters of its own: +- `--context-file`: file to be embedded - state this option multiple times to embed multiple files +- `--chunk-size`: minimum size of each text chunk to be embedded +- `--chunk-separator`: STRING to divide chunks by. newline by default + +`retrieval` example can be tested as follows: + +```bash +make -j && ./retrieval --model ./models/bge-base-en-v1.5-f16.gguf --top-k 3 --context-file README.md --context-file License --chunk-size 100 --chunk-separator . +``` + +This chunks and embeds all given files and starts a loop requesting query inputs: + +``` +Enter query: +``` + +On each query input, top k chunks are shown along with file name, chunk position within file and original text: + +``` +Enter query: describe the mit license +batch_decode: n_tokens = 6, n_seq = 1 +Top 3 similar chunks: +filename: README.md +filepos: 119 +similarity: 0.762334 +textdata: +png) + +[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) + +[Roadmap](https://github. +-------------------- +filename: License +filepos: 0 +similarity: 0.725146 +textdata: +MIT License + +Copyright (c) 2023 Georgi Gerganov + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +-------------------- +filename: README.md +filepos: 9178 +similarity: 0.621722 +textdata: +com/cztomsik/ava) (MIT) +- [ptsochantaris/emeltal](https://github.com/ptsochantaris/emeltal) +- [pythops/tenere](https://github. +-------------------- +``` diff --git a/examples/retrieval/retrieval.cpp b/examples/retrieval/retrieval.cpp new file mode 100644 index 000000000..5ba71e76a --- /dev/null +++ b/examples/retrieval/retrieval.cpp @@ -0,0 +1,350 @@ +#include "common.h" +#include "llama.h" + +#include +#include + +struct retrieval_params { + std::vector context_files; // context files to embed + int32_t chunk_size = 64; // chunk size for context embedding + std::string chunk_separator = "\n"; // chunk separator for context embedding +}; + +static void retrieval_params_print_usage(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & params) { + gpt_print_usage(argc, argv, gpt_params); + printf("retrieval options:\n"); + printf(" --context-file FNAME file containing context to embed.\n"); + printf(" specify multiple files by providing --context-file option multiple times.\n"); + printf(" --chunk-size N minimum length of embedded text chunk (default:%d)\n", params.chunk_size); + printf(" --chunk-separator STRING\n"); + printf(" string to separate chunks (default: \"\\n\")\n"); + printf("\n"); +} + +static void retrieval_params_parse(int argc, char ** argv, gpt_params & gpt_params, retrieval_params & retrieval_params) { + int i = 1; + std::string arg; + while (i < argc) { + arg = argv[i]; + bool invalid_gpt_param = false; + if(gpt_params_find_arg(argc, argv, argv[i], gpt_params, i, invalid_gpt_param)) { + if (invalid_gpt_param) { + fprintf(stderr, "error: invalid argument: %s\n", arg.c_str()); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + // option was parsed by gpt_params_find_arg + } else if (arg == "--context-file") { + if (++i >= argc) { + fprintf(stderr, "error: missing argument for --context-file\n"); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + // store the external file name in params + retrieval_params.context_files.push_back(argv[i]); + } else if (arg == "--chunk-size") { + if (++i >= argc) { + fprintf(stderr, "error: missing argument for --chunk-size\n"); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + retrieval_params.chunk_size = std::stoi(argv[i]); + } else if (arg == "--chunk-separator") { + if (++i >= argc) { + fprintf(stderr, "error: missing argument for --chunk-separator\n"); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + retrieval_params.chunk_separator = argv[i]; + } else { + // unknown argument + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + retrieval_params_print_usage(argc, argv, gpt_params, retrieval_params); + exit(1); + } + i++; + } +} + +struct chunk { + // filename + std::string filename; + // original file position + size_t filepos; + // original text data + std::string textdata = ""; + // tokenized text data + std::vector tokens; + // embedding + std::vector embedding; +}; + +// chunk file data to chunks of size >= chunk_size +// chunk_separator is the separator between chunks +static std::vector chunk_file(const std::string & filename, int chunk_size, const std::string & chunk_separator) { + std::vector chunks; + std::ifstream f(filename.c_str()); + + if (!f.is_open()) { + fprintf(stderr, "Error: could not open file %s\n", filename.c_str()); + return chunks; + } + + chunk current_chunk; + char buffer[1024]; + int64_t filepos = 0; + std::string current = ""; + while (f.read(buffer, 1024)) { + current += std::string(buffer, f.gcount()); + size_t pos; + while ((pos = current.find(chunk_separator)) != std::string::npos) { + current_chunk.textdata += current.substr(0, pos + chunk_separator.size()); + if ((int) current_chunk.textdata.size() > chunk_size) { + // save chunk + current_chunk.filepos = filepos; + current_chunk.filename = filename; + chunks.push_back(current_chunk); + // update filepos + filepos += (int) current_chunk.textdata.size(); + // reset current_chunk + current_chunk = chunk(); + } + current = current.substr(pos + chunk_separator.size()); + } + + } + // add leftover data to last chunk + if (current_chunk.textdata.size() > 0) { + if (chunks.empty()) { + current_chunk.filepos = filepos; + current_chunk.filename = filename; + chunks.push_back(current_chunk); + } else { + chunks.back().textdata += current_chunk.textdata; + } + } + f.close(); + return chunks; +} + +static void batch_add_seq(llama_batch & batch, const std::vector & tokens, int seq_id) { + for (size_t i = 0; i < tokens.size(); i++) { + llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1); + } +} + +static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) { + // clear previous kv_cache values (irrelevant for embeddings) + llama_kv_cache_clear(ctx); + + // run model + fprintf(stderr, "%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq); + if (llama_decode(ctx, batch) < 0) { + fprintf(stderr, "%s : failed to decode\n", __func__); + } + + for (int i = 0; i < batch.n_tokens; i++) { + if (!batch.logits[i]) { + continue; + } + + // try to get sequence embeddings - supported only when pooling_type is not NONE + const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + if (embd == NULL) { + fprintf(stderr, "%s: failed to get embeddings for token %d\n", __func__, i); + continue; + } + } + + float * out = output + batch.seq_id[i][0] * n_embd; + llama_embd_normalize(embd, out, n_embd); + } +} + +int main(int argc, char ** argv) { + gpt_params params; + retrieval_params retrieval_params; + + retrieval_params_parse(argc, argv, params, retrieval_params); + + // For BERT models, batch size must be equal to ubatch size + params.n_ubatch = params.n_batch; + + if (retrieval_params.chunk_size <= 0) { + fprintf(stderr, "chunk_size must be positive\n"); + return 1; + } + if (retrieval_params.context_files.empty()) { + fprintf(stderr, "context_files must be specified\n"); + return 1; + } + params.embedding = true; + + print_build_info(); + + printf("processing files:\n"); + for (auto & context_file : retrieval_params.context_files) { + printf("%s\n", context_file.c_str()); + } + + std::vector chunks; + for (auto & context_file : retrieval_params.context_files) { + std::vector file_chunk = chunk_file(context_file, retrieval_params.chunk_size, retrieval_params.chunk_separator); + chunks.insert(chunks.end(), file_chunk.begin(), file_chunk.end()); + } + printf("Number of chunks: %ld\n", chunks.size()); + + llama_backend_init(); + llama_numa_init(params.numa); + + llama_model * model; + llama_context * ctx; + + // load the model + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + + if (n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + // max batch size + const uint64_t n_batch = params.n_batch; + GGML_ASSERT(params.n_batch >= params.n_ctx); + + // tokenize the prompts and trim + for (auto & chunk : chunks) { + auto inp = ::llama_tokenize(ctx, chunk.textdata, true, false); + if (inp.size() > n_batch) { + fprintf(stderr, "%s: error: chunk size (%lld) exceeds batch size (%lld), increase batch size and re-run\n", + __func__, (long long int) inp.size(), (long long int) n_batch); + return 1; + } + // add eos if not present + if (inp.empty() || inp.back() != llama_token_eos(model)) { + inp.push_back(llama_token_eos(model)); + } + chunk.tokens = inp; + } + + // tokenization stats + if (params.verbose_prompt) { + for (int i = 0; i < (int) chunks.size(); i++) { + fprintf(stderr, "%s: prompt %d: '%s'\n", __func__, i, chunks[i].textdata.c_str()); + fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, chunks[i].tokens.size()); + for (int j = 0; j < (int) chunks[i].tokens.size(); j++) { + fprintf(stderr, "%6d -> '%s'\n", chunks[i].tokens[j], llama_token_to_piece(ctx, chunks[i].tokens[j]).c_str()); + } + fprintf(stderr, "\n\n"); + } + } + + // initialize batch + const int n_chunks = chunks.size(); + struct llama_batch batch = llama_batch_init(n_batch, 0, 1); + + // allocate output + const int n_embd = llama_n_embd(model); + std::vector embeddings(n_chunks * n_embd, 0); + float * emb = embeddings.data(); + + // break into batches + int p = 0; // number of prompts processed already + int s = 0; // number of prompts in current batch + for (int k = 0; k < n_chunks; k++) { + // clamp to n_batch tokens + auto & inp = chunks[k].tokens; + + const uint64_t n_toks = inp.size(); + + // encode if at capacity + if (batch.n_tokens + n_toks > n_batch) { + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + llama_batch_clear(batch); + p += s; + s = 0; + } + + // add to batch + batch_add_seq(batch, inp, s); + s += 1; + } + + // final batch + float * out = emb + p * n_embd; + batch_decode(ctx, batch, out, s, n_embd); + + // save embeddings to chunks + for (int i = 0; i < n_chunks; i++) { + chunks[i].embedding = std::vector(emb + i * n_embd, emb + (i + 1) * n_embd); + // clear tokens as they are no longer needed + chunks[i].tokens.clear(); + } + + // start loop, receive query and return top k similar chunks based on cosine similarity + std::string query; + while (true) { + printf("Enter query: "); + std::getline(std::cin, query); + std::vector query_tokens = llama_tokenize(ctx, query, true); + + struct llama_batch query_batch = llama_batch_init(n_batch, 0, 1); + batch_add_seq(query_batch, query_tokens, 0); + + std::vector query_emb(n_embd, 0); + batch_decode(ctx, query_batch, query_emb.data(), 1, n_embd); + + llama_batch_clear(query_batch); + + // compute cosine similarities + { + std::vector> similarities; + for (int i = 0; i < n_chunks; i++) { + float sim = llama_embd_similarity_cos(chunks[i].embedding.data(), query_emb.data(), n_embd); + similarities.push_back(std::make_pair(i, sim)); + } + + // sort similarities + std::sort(similarities.begin(), similarities.end(), [](const std::pair & a, const std::pair & b) { + return a.second > b.second; + }); + + printf("Top %d similar chunks:\n", params.sparams.top_k); + for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) { + printf("filename: %s\n", chunks[similarities[i].first].filename.c_str()); + printf("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos); + printf("similarity: %f\n", similarities[i].second); + printf("textdata:\n%s\n", chunks[similarities[i].first].textdata.c_str()); + printf("--------------------\n"); + } + } + } + + // clean up + llama_print_timings(ctx); + llama_free(ctx); + llama_free_model(model); + llama_backend_free(); +} diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index ef952e2bd..c3b766882 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -24,6 +24,7 @@ int main(int argc, char ** argv) { std::string result0; std::string result1; + std::string result2; // init llama_model * model; @@ -44,8 +45,8 @@ int main(int argc, char ** argv) { // save state (rng, logits, embedding and kv_cache) to file { - std::vector state_mem(llama_get_state_size(ctx)); - const size_t written = llama_copy_state_data(ctx, state_mem.data()); + std::vector state_mem(llama_state_get_size(ctx)); + const size_t written = llama_state_get_data(ctx, state_mem.data()); FILE *fp_write = fopen("dump_state.bin", "wb"); fwrite(state_mem.data(), 1, written, fp_write); @@ -97,13 +98,13 @@ int main(int argc, char ** argv) { // load state (rng, logits, embedding and kv_cache) from file { - std::vector state_mem(llama_get_state_size(ctx2)); + std::vector state_mem(llama_state_get_size(ctx2)); FILE * fp_read = fopen("dump_state.bin", "rb"); const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); fclose(fp_read); - if (read != llama_set_state_data(ctx2, state_mem.data())) { + if (read != llama_state_set_data(ctx2, state_mem.data())) { fprintf(stderr, "\n%s : failed to read state\n", __func__); llama_free(ctx2); llama_free_model(model); @@ -141,16 +142,104 @@ int main(int argc, char ** argv) { n_past += 1; } - printf("\n"); + printf("\n\n"); llama_free(ctx2); - llama_free_model(model); if (result0 != result1) { fprintf(stderr, "\n%s : error : the 2 generations are different\n", __func__); return 1; } + // make new context + auto* ctx3 = llama_new_context_with_model(model, llama_context_params_from_gpt_params(params)); + + printf("\nsingle seq run: %s", params.prompt.c_str()); + + // load state (rng, logits, embedding and kv_cache) from file + { + std::vector state_mem(llama_state_get_size(ctx3)); + + FILE * fp_read = fopen("dump_state.bin", "rb"); + const size_t read = fread(state_mem.data(), 1, state_mem.size(), fp_read); + fclose(fp_read); + + if (read != llama_state_set_data(ctx3, state_mem.data())) { + fprintf(stderr, "\n%s : failed to read state\n", __func__); + llama_free(ctx3); + llama_free_model(model); + return 1; + } + + fprintf(stderr, "%s : deserialized state from %zd out of a maximum of %zd bytes\n", __func__, read, state_mem.size()); + } + + // restore state (last tokens) + n_past = n_past_saved; + + // save seq 0 and load into seq 1 + { + // save kv of seq 0 + std::vector seq_store(llama_state_seq_get_size(ctx3, 0)); + const size_t ncopy = llama_state_seq_get_data(ctx3, seq_store.data(), 0); + if (ncopy != seq_store.size()) { + fprintf(stderr, "\n%s : seq copy data length %zd does not match expected length %zd\n", __func__, ncopy, seq_store.size()); + llama_free(ctx3); + llama_free_model(model); + return 1; + } + fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy); + + // erase whole kv + llama_kv_cache_clear(ctx3); + fprintf(stderr, "%s : kv cache cleared\n", __func__); + + // restore kv into seq 1 + const size_t nset = llama_state_seq_set_data(ctx3, seq_store.data(), 1); + if (nset != seq_store.size()) { + fprintf(stderr, "\n%s : seq set data length %zd does not match expected length %zd\n", __func__, nset, seq_store.size()); + llama_free(ctx3); + llama_free_model(model); + return 1; + } + fprintf(stderr, "%s : seq 1 restored, %zd bytes\n", __func__, nset); + } + + // third run with seq 1 instead of 0 + for (auto i = 0; i < params.n_predict; i++) { + auto * logits = llama_get_logits(ctx3); + auto n_vocab = llama_n_vocab(model); + std::vector candidates; + candidates.reserve(n_vocab); + for (llama_token token_id = 0; token_id < n_vocab; token_id++) { + candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); + } + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + auto next_token = llama_sample_token(ctx3, &candidates_p); + auto next_token_str = llama_token_to_piece(ctx3, next_token); + + printf("%s", next_token_str.c_str()); + result2 += next_token_str; + + if (llama_decode(ctx3, llama_batch_get_one(&next_token, 1, n_past, 1))) { + fprintf(stderr, "\n%s : failed to evaluate\n", __func__); + llama_free(ctx3); + llama_free_model(model); + return 1; + } + n_past += 1; + } + + printf("\n"); + + llama_free(ctx3); + llama_free_model(model); + + if (result0 != result2) { + fprintf(stderr, "\n%s : error : the seq restore generation is different\n", __func__); + return 1; + } + fprintf(stderr, "\n%s : success\n", __func__); return 0; diff --git a/examples/server/CMakeLists.txt b/examples/server/CMakeLists.txt index d2ee47d01..4b89c5302 100644 --- a/examples/server/CMakeLists.txt +++ b/examples/server/CMakeLists.txt @@ -1,17 +1,34 @@ set(TARGET server) option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} +include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}) +set(TARGET_SRCS server.cpp utils.hpp httplib.h ) +set(PUBLIC_ASSETS + index.html + index.js + completion.js + json-schema-to-grammar.mjs +) +foreach(asset ${PUBLIC_ASSETS}) + set(input "${CMAKE_CURRENT_SOURCE_DIR}/public/${asset}") + set(output "${CMAKE_CURRENT_BINARY_DIR}/${asset}.hpp") + list(APPEND TARGET_SRCS ${output}) + add_custom_command( + DEPENDS "${input}" + OUTPUT "${output}" + COMMAND "${CMAKE_COMMAND}" "-DINPUT=${input}" "-DOUTPUT=${output}" -P "${PROJECT_SOURCE_DIR}/scripts/xxd.cmake" + ) +endforeach() +add_executable(${TARGET} ${TARGET_SRCS}) install(TARGETS ${TARGET} RUNTIME) target_compile_definitions(${TARGET} PRIVATE SERVER_VERBOSE=$ ) -target_link_libraries(${TARGET} PRIVATE common json-schema-to-grammar ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) target_link_libraries(${TARGET} PRIVATE OpenSSL::SSL OpenSSL::Crypto) diff --git a/examples/server/README.md b/examples/server/README.md index 755e1d538..650317991 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -11,54 +11,69 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp. * Continuous batching * Multimodal (wip) * Monitoring endpoints + * Schema-constrained JSON response format The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216). **Command line options:** -- `--threads N`, `-t N`: Set the number of threads to use during generation. -- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. -- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) +- `--threads N`, `-t N`: Set the number of threads to use during generation. Not used if model layers are offloaded to GPU. The server is using batching. This parameter is used only if one token is to be processed on CPU backend. +- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. Not used if model layers are offloaded to GPU. +- `--threads-http N`: Number of threads in the http server pool to process requests. Default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)` - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file. Default: unused +- `-hfr REPO, --hf-repo REPO`: Hugging Face model repository. Default: unused +- `-hff FILE, --hf-file FILE`: Hugging Face model file. Default: unused - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. -- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. -- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. -- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS. -- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. -- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is `512`, but LLaMA models were built with a context of `2048`, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of `4096`. +- `-ngl N`, `--n-gpu-layers N`: When compiled with GPU support, this option allows offloading some layers to the GPU for computation. Generally results in increased performance. +- `-mg i, --main-gpu i`: When using multiple GPUs, this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default, GPU `0` is used. +- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs, this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default, the data is split in proportion to VRAM, but this may not be optimal for performance. +- `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `2048` +- `-ub N`, `--ubatch-size N`: Physical maximum batch size. Default: `512` - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. -- `--numa STRATEGY`: Attempt one of the below optimization strategies that help on some NUMA systems +- `--numa STRATEGY`: Attempt one of the below optimization strategies that may help on some NUMA systems - `--numa distribute`: Spread execution evenly over all nodes - `--numa isolate`: Only spawn threads on CPUs on the node that execution started on -- `--numa numactl`: Use the CPU map provided by numactl -if run without this previously, it is recommended to drop the system page cache before using this -see https://github.com/ggerganov/llama.cpp/issues/1437 +- `--numa numactl`: Use the CPU map provided by numactl. If run without this previously, it is recommended to drop the system +page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437 -- `--numa`: Attempt optimizations that help on some NUMA systems. +- `--numa`: Attempt optimizations that may help on some NUMA systems. - `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains. - `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation. -- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`. -- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. -- `--port`: Set the port to listen. Default: `8080`. -- `--path`: path from which to serve static files (default: disabled) -- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. -- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. -- `--embedding`: Enable embedding extraction, Default: disabled. -- `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) -- `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) -- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load "a system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) +- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600` +- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1` +- `--port`: Set the port to listen. Default: `8080` +- `--path`: Path from which to serve static files. Default: disabled +- `--api-key`: Set an api key for request authorization. By default, the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. +- `--api-key-file`: Path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`s. +- `--embedding`: Enable embedding extraction. Default: disabled +- `-np N`, `--parallel N`: Set the number of slots for process requests. Default: `1` +- `-cb`, `--cont-batching`: Enable continuous batching (a.k.a dynamic batching). Default: disabled +- `-spf FNAME`, `--system-prompt-file FNAME` Set a file to load a system prompt (initial prompt of all slots). This is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `--mmproj MMPROJ_FILE`: Path to a multimodal projector file for LLaVA. -- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w` -- `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n` -- `-n N, --n-predict N`: Set the maximum tokens to predict (default: -1) +- `--grp-attn-n`: Set the group attention factor to extend context size through self-extend. Used together with group attention width `--grp-attn-w`. Default: `1`, which is disabled. +- `--grp-attn-w`: Set the group attention width to extend context size through self-extend. Used together with group attention factor `--grp-attn-n`. Default: `512` +- `-n N, --n-predict N`: Set the maximum tokens to predict. Default: `-1` - `--slots-endpoint-disable`: To disable slots state monitoring endpoint. Slots state may contain user data, prompts included. -- `--metrics`: enable prometheus `/metrics` compatible endpoint (default: disabled) -- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name (default: template taken from model's metadata). We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) -- `--log-disable`: Output logs to stdout only, default: enabled. -- `--log-format FORMAT`: Define the log output to FORMAT: json or text (default: json) +- `--metrics`: enable prometheus `/metrics` compatible endpoint. Default: disabled +- `--slot-save-path PATH`: Specifies the path where the state of slots (the prompt cache) can be stored. If not provided, the slot management endpoints will be disabled. +- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) +- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled +- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json` +- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn` +- `--rope-freq-base N` : RoPE frequency base (default: loaded from model) +- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25) +- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation) +- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0) +- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0) +- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0) +- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls` +- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled) +- `-fa`, `--flash-attn` : enable flash attention (default: disabled). +- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`) +- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options) **If compiled with `LLAMA_SERVER_SSL=ON`** - `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key @@ -66,23 +81,26 @@ see https://github.com/ggerganov/llama.cpp/issues/1437 ## Build -server is build alongside everything else from the root of the project +`server` is built alongside everything else from the root of the project - Using `make`: ```bash - make + make server ``` - Using `CMake`: ```bash - cmake --build . --config Release + cmake -B build + cmake --build build --config Release -t server ``` + Binary is at `./build/bin/server` + ## Build with SSL -server can also be built with SSL support using OpenSSL 3 +`server` can also be built with SSL support using OpenSSL 3 - Using `make`: @@ -96,10 +114,8 @@ server can also be built with SSL support using OpenSSL 3 - Using `CMake`: ```bash - mkdir build - cd build - cmake .. -DLLAMA_SERVER_SSL=ON - make server + cmake -B build -DLLAMA_SERVER_SSL=ON + cmake --build build --config Release -t server ``` ## Quick Start @@ -132,7 +148,7 @@ docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/ ## Testing with CURL -Using [curl](https://curl.se/). On Windows `curl.exe` should be available in the base OS. +Using [curl](https://curl.se/). On Windows, `curl.exe` should be available in the base OS. ```sh curl --request POST \ @@ -156,7 +172,7 @@ mkdir llama-client cd llama-client ``` -Create a index.js file and put inside this: +Create a index.js file and put this inside: ```javascript const prompt = `Building a website can be done in 10 simple steps:`; @@ -187,8 +203,8 @@ node index.js - 503 -> `{"status": "loading model"}` if the model is still being loaded. - 500 -> `{"status": "error"}` if the model failed to load. - 200 -> `{"status": "ok", "slots_idle": 1, "slots_processing": 2 }` if the model is successfully loaded and the server is ready for further requests mentioned below. - - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slot are currently available. - - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slot are currently available. + - 200 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if no slots are currently available. + - 503 -> `{"status": "no slot available", "slots_idle": 0, "slots_processing": 32}` if the query parameter `fail_on_no_slot` is provided and no slots are currently available. If the query parameter `include_slots` is passed, `slots` field will contain internal slots data except if `--slots-endpoint-disable` is set. @@ -202,75 +218,77 @@ node index.js - The model's `tokenizer.ggml.add_bos_token` metadata is `true` - The system prompt is empty - `temperature`: Adjust the randomness of the generated text (default: 0.8). + `temperature`: Adjust the randomness of the generated text. Default: `0.8` - `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` (default: 0.0, 0.0 = disabled). + `dynatemp_range`: Dynamic temperature range. The final temperature will be in the range of `[temperature - dynatemp_range; temperature + dynatemp_range]` Default: `0.0`, which is disabled. - `dynatemp_exponent`: Dynamic temperature exponent (default: 1.0). + `dynatemp_exponent`: Dynamic temperature exponent. Default: `1.0` - `top_k`: Limit the next token selection to the K most probable tokens (default: 40). + `top_k`: Limit the next token selection to the K most probable tokens. Default: `40` - `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P (default: 0.95). + `top_p`: Limit the next token selection to a subset of tokens with a cumulative probability above a threshold P. Default: `0.95` - `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token (default: 0.05). + `min_p`: The minimum probability for a token to be considered, relative to the probability of the most likely token. Default: `0.05` - `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. (default: -1, -1 = infinity). + `n_predict`: Set the maximum number of tokens to predict when generating text. **Note:** May exceed the set limit slightly if the last token is a partial multibyte character. When 0, no tokens will be generated but the prompt is evaluated into the cache. Default: `-1`, where `-1` is infinity. `n_keep`: Specify the number of tokens from the prompt to retain when the context size is exceeded and tokens need to be discarded. - By default, this value is set to 0 (meaning no tokens are kept). Use `-1` to retain all tokens from the prompt. + By default, this value is set to `0`, meaning no tokens are kept. Use `-1` to retain all tokens from the prompt. `stream`: It allows receiving each predicted token in real-time instead of waiting for the completion to finish. To enable this, set to `true`. `stop`: Specify a JSON array of stopping strings. - These words will not be included in the completion, so make sure to add them to the prompt for the next iteration (default: []). + These words will not be included in the completion, so make sure to add them to the prompt for the next iteration. Default: `[]` - `tfs_z`: Enable tail free sampling with parameter z (default: 1.0, 1.0 = disabled). + `tfs_z`: Enable tail free sampling with parameter z. Default: `1.0`, which is disabled. - `typical_p`: Enable locally typical sampling with parameter p (default: 1.0, 1.0 = disabled). + `typical_p`: Enable locally typical sampling with parameter p. Default: `1.0`, which is disabled. - `repeat_penalty`: Control the repetition of token sequences in the generated text (default: 1.1). + `repeat_penalty`: Control the repetition of token sequences in the generated text. Default: `1.1` - `repeat_last_n`: Last n tokens to consider for penalizing repetition (default: 64, 0 = disabled, -1 = ctx-size). + `repeat_last_n`: Last n tokens to consider for penalizing repetition. Default: `64`, where `0` is disabled and `-1` is ctx-size. - `penalize_nl`: Penalize newline tokens when applying the repeat penalty (default: true). + `penalize_nl`: Penalize newline tokens when applying the repeat penalty. Default: `true` - `presence_penalty`: Repeat alpha presence penalty (default: 0.0, 0.0 = disabled). + `presence_penalty`: Repeat alpha presence penalty. Default: `0.0`, which is disabled. - `frequency_penalty`: Repeat alpha frequency penalty (default: 0.0, 0.0 = disabled); + `frequency_penalty`: Repeat alpha frequency penalty. Default: `0.0`, which is disabled. - `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens (default: `null` = use the original `prompt`). + `penalty_prompt`: This will replace the `prompt` for the purpose of the penalty evaluation. Can be either `null`, a string or an array of numbers representing tokens. Default: `null`, which is to use the original `prompt`. - `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation (default: 0, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0). + `mirostat`: Enable Mirostat sampling, controlling perplexity during text generation. Default: `0`, where `0` is disabled, `1` is Mirostat, and `2` is Mirostat 2.0. - `mirostat_tau`: Set the Mirostat target entropy, parameter tau (default: 5.0). + `mirostat_tau`: Set the Mirostat target entropy, parameter tau. Default: `5.0` - `mirostat_eta`: Set the Mirostat learning rate, parameter eta (default: 0.1). + `mirostat_eta`: Set the Mirostat learning rate, parameter eta. Default: `0.1` - `grammar`: Set grammar for grammar-based sampling (default: no grammar) + `grammar`: Set grammar for grammar-based sampling. Default: no grammar - `seed`: Set the random number generator (RNG) seed (default: -1, -1 = random seed). + `json_schema`: Set a JSON schema for grammar-based sampling (e.g. `{"items": {"type": "string"}, "minItems": 10, "maxItems": 100}` of a list of strings, or `{}` for any JSON). See [tests](../../tests/test-json-schema-to-grammar.cpp) for supported features. Default: no JSON schema. - `ignore_eos`: Ignore end of stream token and continue generating (default: false). + `seed`: Set the random number generator (RNG) seed. Default: `-1`, which is a random seed. - `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. (default: []). + `ignore_eos`: Ignore end of stream token and continue generating. Default: `false` - `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token (default: 0) + `logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]` - `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum (default: 0) + `n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0` + + `min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0` `image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. - `slot_id`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot (default: -1) + `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` - `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. (default: false) + `cache_prompt`: Re-use previously cached prompt from the last request if possible. This may prevent re-caching the prompt from scratch. Default: `false` `system_prompt`: Change the system prompt (initial prompt of all slots), this is useful for chat applications. [See more](#change-system-prompt-on-runtime) - `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. (default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values) + `samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["top_k", "tfs_z", "typical_p", "top_p", "min_p", "temperature"]` - these are all the available values. ### Result JSON -- Note: When using streaming mode (`stream`) only `content` and `stop` will be returned until end of completion. +- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. - `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure: @@ -284,7 +302,7 @@ node index.js }, { "prob": float, - "tok_str": "" + "tok_str": "" }, ... ] @@ -313,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`. `content`: Set the text to tokenize. - Note that a special `BOS` token is never inserted. + `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` - **POST** `/detokenize`: Convert tokens to text. @@ -354,14 +372,16 @@ Notice that each `probs` is an array of length `n_probs`. - `assistant_name` - the required assistant name to generate the prompt in case you have specified a system prompt for all slots. - `user_name` - the required anti-prompt to generate the prompt in case you have specified a system prompt for all slots. -- `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. +- `default_generation_settings` - the default generation settings for the `/completion` endpoint, which has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only model with [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, ChatML template will be used. *Options:* - See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such are `mirostat` are supported. + See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs/api-reference/chat). While some OpenAI-specific features such as function calling aren't supported, llama.cpp `/completion`-specific features such as `mirostat` are supported. + + The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}`), similar to other OpenAI-inspired API providers. *Examples:* @@ -511,16 +531,67 @@ Available metrics: - `llamacpp:tokens_predicted_total`: Number of generation tokens processed. - `llamacpp:prompt_tokens_seconds`: Average prompt throughput in tokens/s. - `llamacpp:predicted_tokens_seconds`: Average generation throughput in tokens/s. -- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. 1 means 100 percent usage. +- `llamacpp:kv_cache_usage_ratio`: KV-cache usage. `1` means 100 percent usage. - `llamacpp:kv_cache_tokens`: KV-cache tokens. -- `llamacpp:requests_processing`: Number of request processing. -- `llamacpp:requests_deferred`: Number of request deferred. +- `llamacpp:requests_processing`: Number of requests processing. +- `llamacpp:requests_deferred`: Number of requests deferred. + +- **POST** `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. + + *Options:* + + `filename`: Name of the file to save the slot's prompt cache. The file will be saved in the directory specified by the `--slot-save-path` server parameter. + +### Result JSON + +```json +{ + "id_slot": 0, + "filename": "slot_save_file.bin", + "n_saved": 1745, + "n_written": 14309796, + "timings": { + "save_ms": 49.865 + } +} +``` + +- **POST** `/slots/{id_slot}?action=restore`: Restore the prompt cache of the specified slot from a file. + + *Options:* + + `filename`: Name of the file to restore the slot's prompt cache from. The file should be located in the directory specified by the `--slot-save-path` server parameter. + +### Result JSON + +```json +{ + "id_slot": 0, + "filename": "slot_save_file.bin", + "n_restored": 1745, + "n_read": 14309796, + "timings": { + "restore_ms": 42.937 + } +} +``` + +- **POST** `/slots/{id_slot}?action=erase`: Erase the prompt cache of the specified slot. + +### Result JSON + +```json +{ + "id_slot": 0, + "n_erased": 1745 +} +``` ## More examples ### Change system prompt on runtime -To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt` to achieve that. This only needs to be done once to establish it. +To use the server example to serve multiple chat-type clients while keeping the same system prompt, you can utilize the option `system_prompt`. This only needs to be used once. `prompt`: Specify a context that you want all connecting clients to respect. @@ -559,11 +630,11 @@ bash chat.sh ### OAI-like API -The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi +The HTTP `server` supports an OAI-like API: https://github.com/openai/openai-openapi ### API errors -Server returns error in the same format as OAI: https://github.com/openai/openai-openapi +`server` returns errors in the same format as OAI: https://github.com/openai/openai-openapi Example of an error: diff --git a/examples/server/bench/README.md b/examples/server/bench/README.md index a53ad64d7..23a3ec975 100644 --- a/examples/server/bench/README.md +++ b/examples/server/bench/README.md @@ -2,13 +2,15 @@ Benchmark is using [k6](https://k6.io/). -##### Install k6 +##### Install k6 and sse extension -Follow instruction from: https://k6.io/docs/get-started/installation/ +SSE is not supported by default in k6, you have to build k6 with the [xk6-sse](https://github.com/phymbert/xk6-sse) extension. -Example for ubuntu: +Example: ```shell -snap install k6 +go install go.k6.io/xk6/cmd/xk6@latest +xk6 build master \ +--with github.com/phymbert/xk6-sse ``` #### Download a dataset @@ -46,7 +48,7 @@ server --host localhost --port 8080 \ For 500 chat completions request with 8 concurrent users during maximum 10 minutes, run: ```shell -k6 run script.js --duration 10m --iterations 500 --vus 8 +./k6 run script.js --duration 10m --iterations 500 --vus 8 ``` The benchmark values can be overridden with: @@ -86,3 +88,33 @@ K6 metrics might be compared against [server metrics](../README.md), with: ```shell curl http://localhost:8080/metrics ``` + +### Using the CI python script +The `bench.py` script does several steps: +- start the server +- define good variable for k6 +- run k6 script +- extract metrics from prometheus + +It aims to be used in the CI, but you can run it manually: + +```shell +LLAMA_SERVER_BIN_PATH=../../../cmake-build-release/bin/server python bench.py \ + --runner-label local \ + --name local \ + --branch `git rev-parse --abbrev-ref HEAD` \ + --commit `git rev-parse HEAD` \ + --scenario script.js \ + --duration 5m \ + --hf-repo ggml-org/models \ + --hf-file phi-2/ggml-model-q4_0.gguf \ + --model-path-prefix models \ + --parallel 4 \ + -ngl 33 \ + --batch-size 2048 \ + --ubatch-size 256 \ + --ctx-size 4096 \ + --n-prompts 200 \ + --max-prompt-tokens 256 \ + --max-tokens 256 +``` diff --git a/examples/server/bench/bench.py b/examples/server/bench/bench.py new file mode 100644 index 000000000..86c5de101 --- /dev/null +++ b/examples/server/bench/bench.py @@ -0,0 +1,309 @@ +import argparse +import json +import os +import re +import signal +import socket +import subprocess +import sys +import threading +import time +import traceback +from contextlib import closing +from datetime import datetime + +import matplotlib +import matplotlib.dates +import matplotlib.pyplot as plt +import requests +from statistics import mean + + +def main(args_in: list[str] | None = None) -> None: + parser = argparse.ArgumentParser(description="Start server benchmark scenario") + parser.add_argument("--name", type=str, help="Bench name", required=True) + parser.add_argument("--runner-label", type=str, help="Runner label", required=True) + parser.add_argument("--branch", type=str, help="Branch name", default="detached") + parser.add_argument("--commit", type=str, help="Commit name", default="dirty") + parser.add_argument("--host", type=str, help="Server listen host", default="0.0.0.0") + parser.add_argument("--port", type=int, help="Server listen host", default="8080") + parser.add_argument("--model-path-prefix", type=str, help="Prefix where to store the model files", default="models") + parser.add_argument("--n-prompts", type=int, + help="SERVER_BENCH_N_PROMPTS: total prompts to randomly select in the benchmark", required=True) + parser.add_argument("--max-prompt-tokens", type=int, + help="SERVER_BENCH_MAX_PROMPT_TOKENS: maximum prompt tokens to filter out in the dataset", + required=True) + parser.add_argument("--max-tokens", type=int, + help="SERVER_BENCH_MAX_CONTEXT: maximum context size of the completions request to filter out in the dataset: prompt + predicted tokens", + required=True) + parser.add_argument("--hf-repo", type=str, help="Hugging Face model repository", required=True) + parser.add_argument("--hf-file", type=str, help="Hugging Face model file", required=True) + parser.add_argument("-ngl", "--n-gpu-layers", type=int, help="layers to the GPU for computation", required=True) + parser.add_argument("--ctx-size", type=int, help="Set the size of the prompt context", required=True) + parser.add_argument("--parallel", type=int, help="Set the number of slots for process requests", required=True) + parser.add_argument("--batch-size", type=int, help="Set the batch size for prompt processing", required=True) + parser.add_argument("--ubatch-size", type=int, help="physical maximum batch size", required=True) + parser.add_argument("--scenario", type=str, help="Scenario to run", required=True) + parser.add_argument("--duration", type=str, help="Bench scenario", required=True) + + args = parser.parse_args(args_in) + + start_time = time.time() + + # Start the server and performance scenario + try: + server_process = start_server(args) + except Exception: + print("bench: server start error :") + traceback.print_exc(file=sys.stdout) + sys.exit(1) + + # start the benchmark + try: + start_benchmark(args) + + iterations = 0 + with open("results.github.env", 'w') as github_env: + # parse output + with open('k6-results.json', 'r') as bench_results: + # Load JSON data from file + data = json.load(bench_results) + for metric_name in data['metrics']: + for metric_metric in data['metrics'][metric_name]: + value = data['metrics'][metric_name][metric_metric] + if isinstance(value, float) or isinstance(value, int): + value = round(value, 2) + data['metrics'][metric_name][metric_metric]=value + github_env.write( + f"{escape_metric_name(metric_name)}_{escape_metric_name(metric_metric)}={value}\n") + iterations = data['root_group']['checks']['success completion']['passes'] + + except Exception: + print("bench: error :") + traceback.print_exc(file=sys.stdout) + + # Stop the server + if server_process: + try: + print(f"bench: shutting down server pid={server_process.pid} ...") + if os.name == 'nt': + interrupt = signal.CTRL_C_EVENT + else: + interrupt = signal.SIGINT + server_process.send_signal(interrupt) + server_process.wait(0.5) + + except subprocess.TimeoutExpired: + print(f"server still alive after 500ms, force-killing pid={server_process.pid} ...") + server_process.kill() # SIGKILL + server_process.wait() + + while is_server_listening(args.host, args.port): + time.sleep(0.1) + + title = (f"llama.cpp {args.name} on {args.runner_label}\n " + f"duration={args.duration} {iterations} iterations") + xlabel = (f"{args.hf_repo}/{args.hf_file}\n" + f"parallel={args.parallel} ctx-size={args.ctx_size} ngl={args.n_gpu_layers} batch-size={args.batch_size} ubatch-size={args.ubatch_size} pp={args.max_prompt_tokens} pp+tg={args.max_tokens}\n" + f"branch={args.branch} commit={args.commit}") + + # Prometheus + end_time = time.time() + prometheus_metrics = {} + if is_server_listening("0.0.0.0", 9090): + metrics = ['prompt_tokens_seconds', 'predicted_tokens_seconds', + 'kv_cache_usage_ratio', 'requests_processing', 'requests_deferred'] + + for metric in metrics: + resp = requests.get(f"http://localhost:9090/api/v1/query_range", + params={'query': 'llamacpp:' + metric, 'start': start_time, 'end': end_time, 'step': 2}) + + with open(f"{metric}.json", 'w') as metric_json: + metric_json.write(resp.text) + + if resp.status_code != 200: + print(f"bench: unable to extract prometheus metric {metric}: {resp.text}") + else: + metric_data = resp.json() + values = metric_data['data']['result'][0]['values'] + timestamps, metric_values = zip(*values) + metric_values = [float(value) for value in metric_values] + prometheus_metrics[metric] = metric_values + timestamps_dt = [datetime.fromtimestamp(int(ts)) for ts in timestamps] + plt.figure(figsize=(16, 10), dpi=80) + plt.plot(timestamps_dt, metric_values, label=metric) + plt.xticks(rotation=0, fontsize=14, horizontalalignment='center', alpha=.7) + plt.yticks(fontsize=12, alpha=.7) + + ylabel = f"llamacpp:{metric}" + plt.title(title, + fontsize=14, wrap=True) + plt.grid(axis='both', alpha=.3) + plt.ylabel(ylabel, fontsize=22) + plt.xlabel(xlabel, fontsize=14, wrap=True) + plt.gca().xaxis.set_major_locator(matplotlib.dates.MinuteLocator()) + plt.gca().xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m-%d %H:%M:%S")) + plt.gcf().autofmt_xdate() + + # Remove borders + plt.gca().spines["top"].set_alpha(0.0) + plt.gca().spines["bottom"].set_alpha(0.3) + plt.gca().spines["right"].set_alpha(0.0) + plt.gca().spines["left"].set_alpha(0.3) + + # Save the plot as a jpg image + plt.savefig(f'{metric}.jpg', dpi=60) + plt.close() + + # Mermaid format in case images upload failed + with (open(f"{metric}.mermaid", 'w') as mermaid_f): + mermaid = ( + f"""--- +config: + xyChart: + titleFontSize: 12 + width: 900 + height: 600 + themeVariables: + xyChart: + titleColor: "#000000" +--- +xychart-beta + title "{title}" + y-axis "llamacpp:{metric}" + x-axis "llamacpp:{metric}" {int(min(timestamps))} --> {int(max(timestamps))} + line [{', '.join([str(round(float(value), 2)) for value in metric_values])}] + """) + mermaid_f.write(mermaid) + + # 140 chars max for commit status description + bench_results = { + "i": iterations, + "req": { + "p95": round(data['metrics']["http_req_duration"]["p(95)"], 2), + "avg": round(data['metrics']["http_req_duration"]["avg"], 2), + }, + "pp": { + "p95": round(data['metrics']["llamacpp_prompt_processing_second"]["p(95)"], 2), + "avg": round(data['metrics']["llamacpp_prompt_processing_second"]["avg"], 2), + "0": round(mean(prometheus_metrics['prompt_tokens_seconds']), 2), + }, + "tg": { + "p95": round(data['metrics']["llamacpp_tokens_second"]["p(95)"], 2), + "avg": round(data['metrics']["llamacpp_tokens_second"]["avg"], 2), + "0": round(mean(prometheus_metrics['predicted_tokens_seconds']), 2), + }, + } + with open("results.github.env", 'a') as github_env: + github_env.write(f"BENCH_RESULTS={json.dumps(bench_results, indent=None, separators=(',', ':') )}\n") + github_env.write(f"BENCH_ITERATIONS={iterations}\n") + + title = title.replace('\n', ' ') + xlabel = xlabel.replace('\n', ' ') + github_env.write(f"BENCH_GRAPH_TITLE={title}\n") + github_env.write(f"BENCH_GRAPH_XLABEL={xlabel}\n") + + +def start_benchmark(args): + k6_path = './k6' + if 'BENCH_K6_BIN_PATH' in os.environ: + k6_path = os.environ['BENCH_K6_BIN_PATH'] + k6_args = [ + 'run', args.scenario, + '--no-color', + ] + k6_args.extend(['--duration', args.duration]) + k6_args.extend(['--iterations', args.n_prompts]) + k6_args.extend(['--vus', args.parallel]) + k6_args.extend(['--summary-export', 'k6-results.json']) + args = f"SERVER_BENCH_N_PROMPTS={args.n_prompts} SERVER_BENCH_MAX_PROMPT_TOKENS={args.max_prompt_tokens} SERVER_BENCH_MAX_CONTEXT={args.max_tokens} " + args = args + ' '.join([str(arg) for arg in [k6_path, *k6_args]]) + print(f"bench: starting k6 with: {args}") + k6_completed = subprocess.run(args, shell=True, stdout=sys.stdout, stderr=sys.stderr) + if k6_completed.returncode != 0: + raise Exception("bench: unable to run k6") + + +def start_server(args): + server_process = start_server_background(args) + + attempts = 0 + max_attempts = 20 + if 'GITHUB_ACTIONS' in os.environ: + max_attempts *= 2 + + while not is_server_listening(args.host, args.port): + attempts += 1 + if attempts > max_attempts: + assert False, "server not started" + print(f"bench: waiting for server to start ...") + time.sleep(0.5) + + print("bench: server started.") + return server_process + + +def start_server_background(args): + # Start the server + server_path = '../../../build/bin/server' + if 'LLAMA_SERVER_BIN_PATH' in os.environ: + server_path = os.environ['LLAMA_SERVER_BIN_PATH'] + server_args = [ + '--host', args.host, + '--port', args.port, + ] + model_file = args.model_path_prefix + os.path.sep + args.hf_file + model_dir = os.path.dirname(model_file) + if not os.path.exists(model_dir): + os.makedirs(model_dir) + server_args.extend(['--model', model_file]) + server_args.extend(['--hf-repo', args.hf_repo]) + server_args.extend(['--hf-file', args.hf_file]) + server_args.extend(['--n-gpu-layers', args.n_gpu_layers]) + server_args.extend(['--ctx-size', args.ctx_size]) + server_args.extend(['--parallel', args.parallel]) + server_args.extend(['--batch-size', args.batch_size]) + server_args.extend(['--ubatch-size', args.ubatch_size]) + server_args.extend(['--n-predict', args.max_tokens * 2]) + server_args.extend(['--defrag-thold', "0.1"]) + server_args.append('--cont-batching') + server_args.append('--metrics') + server_args.append('--flash-attn') + server_args.extend(['--log-format', "text"]) + args = [str(arg) for arg in [server_path, *server_args]] + print(f"bench: starting server with: {' '.join(args)}") + pkwargs = { + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE + } + server_process = subprocess.Popen( + args, + **pkwargs) + + def server_log(in_stream, out_stream): + for line in iter(in_stream.readline, b''): + print(line.decode('utf-8'), end='', file=out_stream) + + thread_stdout = threading.Thread(target=server_log, args=(server_process.stdout, sys.stdout)) + thread_stdout.start() + thread_stderr = threading.Thread(target=server_log, args=(server_process.stderr, sys.stderr)) + thread_stderr.start() + + return server_process + + +def is_server_listening(server_fqdn, server_port): + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + result = sock.connect_ex((server_fqdn, server_port)) + _is_server_listening = result == 0 + if _is_server_listening: + print(f"server is listening on {server_fqdn}:{server_port}...") + return _is_server_listening + + +def escape_metric_name(metric_name): + return re.sub('[^A-Z0-9]', '_', metric_name.upper()) + + +if __name__ == '__main__': + main() diff --git a/examples/server/bench/prometheus.yml b/examples/server/bench/prometheus.yml new file mode 100644 index 000000000..b15ee5244 --- /dev/null +++ b/examples/server/bench/prometheus.yml @@ -0,0 +1,9 @@ +global: + scrape_interval: 10s + external_labels: + llamacpp: 'server' + +scrape_configs: + - job_name: 'llama.cpp server' + static_configs: + - targets: ['localhost:8080'] diff --git a/examples/server/bench/requirements.txt b/examples/server/bench/requirements.txt new file mode 100644 index 000000000..66ed226ed --- /dev/null +++ b/examples/server/bench/requirements.txt @@ -0,0 +1,2 @@ +matplotlib +requests diff --git a/examples/server/bench/script.js b/examples/server/bench/script.js index a4f5ac5ab..bdf4f5abc 100644 --- a/examples/server/bench/script.js +++ b/examples/server/bench/script.js @@ -1,4 +1,4 @@ -import http from 'k6/http' +import sse from 'k6/x/sse' import {check, sleep} from 'k6' import {SharedArray} from 'k6/data' import {Counter, Rate, Trend} from 'k6/metrics' @@ -53,7 +53,9 @@ const data = new SharedArray('conversations', function () { const llamacpp_prompt_tokens = new Trend('llamacpp_prompt_tokens') const llamacpp_completion_tokens = new Trend('llamacpp_completion_tokens') + const llamacpp_tokens_second = new Trend('llamacpp_tokens_second') +const llamacpp_prompt_processing_second = new Trend('llamacpp_prompt_processing_second') const llamacpp_prompt_tokens_total_counter = new Counter('llamacpp_prompt_tokens_total_counter') const llamacpp_completion_tokens_total_counter = new Counter('llamacpp_completion_tokens_total_counter') @@ -86,35 +88,63 @@ export default function () { } ], "model": model, - "stream": false, - "max_tokens": max_tokens + "stream": true, + "seed": 42, + "max_tokens": max_tokens, + "stop": ["<|im_end|>"] // This is temporary for phi-2 base (i.e. not instructed) since the server expects that the model always to emit BOS } - const body = JSON.stringify(payload) + const params = {method: 'POST', body: JSON.stringify(payload)}; - let res = http.post(`${server_url}/chat/completions`, body, { - headers: {'Content-Type': 'application/json'}, - timeout: '300s' + const startTime = new Date() + let promptEvalEndTime = null + let prompt_tokens = 0 + let completions_tokens = 0 + let finish_reason = null + const res = sse.open(`${server_url}/chat/completions`, params, function (client) { + client.on('event', function (event) { + if (promptEvalEndTime == null) { + promptEvalEndTime = new Date() + } + + let chunk = JSON.parse(event.data) + let choice = chunk.choices[0] + if (choice.finish_reason) { + finish_reason = choice.finish_reason + } + + if (chunk.usage) { + prompt_tokens = chunk.usage.prompt_tokens + llamacpp_prompt_tokens.add(prompt_tokens) + llamacpp_prompt_tokens_total_counter.add(prompt_tokens) + + completions_tokens = chunk.usage.completion_tokens + llamacpp_completion_tokens.add(completions_tokens) + llamacpp_completion_tokens_total_counter.add(completions_tokens) + } + }) + + client.on('error', function (e) { + console.log('An unexpected error occurred: ', e.error()); + throw e; + }) }) check(res, {'success completion': (r) => r.status === 200}) - if (res.status === 200) { - const completions = res.json() + const endTime = new Date() - llamacpp_prompt_tokens.add(completions.usage.prompt_tokens) - llamacpp_prompt_tokens_total_counter.add(completions.usage.prompt_tokens) - - llamacpp_completion_tokens.add(completions.usage.completion_tokens) - llamacpp_completion_tokens_total_counter.add(completions.usage.completion_tokens) - - llamacpp_completions_truncated_rate.add(completions.choices[0].finish_reason === 'length') - llamacpp_completions_stop_rate.add(completions.choices[0].finish_reason === 'stop') - - llamacpp_tokens_second.add(completions.usage.total_tokens / res.timings.duration * 1.e3) - } else { - console.error(`response: ${res.body} request=${payload}`) + const promptEvalTime = promptEvalEndTime - startTime + if (promptEvalTime > 0) { + llamacpp_prompt_processing_second.add(prompt_tokens / (promptEvalEndTime - startTime) * 1.e3) } + const completion_time = endTime - promptEvalEndTime + if (completions_tokens > 0 && completion_time > 0) { + llamacpp_tokens_second.add(completions_tokens / completion_time * 1.e3) + } + llamacpp_completions_truncated_rate.add(finish_reason === 'length') + llamacpp_completions_stop_rate.add(finish_reason === 'stop') + sleep(0.3) } diff --git a/examples/server/completion.js.hpp b/examples/server/completion.js.hpp deleted file mode 100644 index f8bb4b0f2..000000000 --- a/examples/server/completion.js.hpp +++ /dev/null @@ -1,486 +0,0 @@ -unsigned char completion_js[] = { - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x44, - 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x3a, 0x20, 0x74, 0x72, - 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, - 0x69, 0x63, 0x74, 0x3a, 0x20, 0x35, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, - 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x3a, - 0x20, 0x30, 0x2e, 0x32, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, - 0x3a, 0x20, 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x5d, 0x0a, 0x7d, - 0x3b, 0x0a, 0x0a, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x0a, - 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, - 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, - 0x74, 0x6f, 0x72, 0x2e, 0x20, 0x52, 0x65, 0x63, 0x6f, 0x6d, 0x6d, 0x65, - 0x6e, 0x64, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x6d, 0x6f, 0x73, - 0x74, 0x20, 0x75, 0x73, 0x65, 0x20, 0x63, 0x61, 0x73, 0x65, 0x73, 0x2e, - 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, - 0x6c, 0x65, 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, - 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, - 0x6a, 0x73, 0x27, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x22, - 0x54, 0x65, 0x6c, 0x6c, 0x20, 0x6d, 0x65, 0x20, 0x61, 0x20, 0x6a, 0x6f, - 0x6b, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, - 0x69, 0x63, 0x74, 0x3a, 0x20, 0x38, 0x30, 0x30, 0x7d, 0x29, 0x0a, 0x2f, - 0x2f, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, - 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x73, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, - 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, - 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x2f, 0x2f, 0x0a, - 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, - 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, 0x6c, - 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, - 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, 0x3d, 0x20, 0x7b, - 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, - 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x21, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, - 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, - 0x77, 0x20, 0x41, 0x62, 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, - 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x73, 0x2c, - 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x70, 0x6f, - 0x6e, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, - 0x66, 0x65, 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x2c, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x65, 0x74, 0x68, 0x6f, 0x64, 0x3a, 0x20, 0x27, - 0x50, 0x4f, 0x53, 0x54, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x62, - 0x6f, 0x64, 0x79, 0x3a, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, - 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x61, 0x64, 0x65, - 0x72, 0x73, 0x3a, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x27, 0x43, 0x6f, 0x6e, 0x6e, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x27, - 0x3a, 0x20, 0x27, 0x6b, 0x65, 0x65, 0x70, 0x2d, 0x61, 0x6c, 0x69, 0x76, - 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x43, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2d, 0x54, 0x79, 0x70, 0x65, 0x27, - 0x3a, 0x20, 0x27, 0x61, 0x70, 0x70, 0x6c, 0x69, 0x63, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x27, 0x41, 0x63, 0x63, 0x65, 0x70, 0x74, 0x27, - 0x3a, 0x20, 0x27, 0x74, 0x65, 0x78, 0x74, 0x2f, 0x65, 0x76, 0x65, 0x6e, - 0x74, 0x2d, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x27, 0x2c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x28, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x20, - 0x3f, 0x20, 0x7b, 0x27, 0x41, 0x75, 0x74, 0x68, 0x6f, 0x72, 0x69, 0x7a, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x60, 0x42, 0x65, 0x61, - 0x72, 0x65, 0x72, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x60, 0x7d, 0x20, - 0x3a, 0x20, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x3a, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, - 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x0a, 0x20, 0x20, 0x7d, 0x29, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, - 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x73, 0x70, - 0x6f, 0x6e, 0x73, 0x65, 0x2e, 0x62, 0x6f, 0x64, 0x79, 0x2e, 0x67, 0x65, - 0x74, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, - 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x54, 0x65, 0x78, - 0x74, 0x44, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, - 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x42, 0x75, 0x66, - 0x66, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x70, 0x61, 0x72, 0x74, - 0x69, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x72, 0x65, 0x61, 0x64, 0x20, 0x6c, - 0x69, 0x6e, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x63, - 0x6f, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x72, 0x65, - 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, 0x64, 0x28, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x6f, 0x6e, 0x65, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, - 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x41, - 0x64, 0x64, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, - 0x76, 0x65, 0x72, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, 0x74, 0x6f, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x20, - 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, - 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x6c, 0x65, 0x66, - 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x2b, 0x20, 0x64, 0x65, 0x63, 0x6f, - 0x64, 0x65, 0x72, 0x2e, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x28, 0x72, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x43, 0x68, 0x65, 0x63, 0x6b, 0x20, 0x69, 0x66, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x72, 0x61, 0x63, - 0x74, 0x65, 0x72, 0x20, 0x69, 0x73, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, - 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x73, - 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72, 0x65, 0x61, - 0x6b, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, 0x65, 0x6e, 0x64, - 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53, - 0x70, 0x6c, 0x69, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, - 0x74, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, - 0x69, 0x6e, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2e, - 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x5c, 0x6e, 0x27, 0x29, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x49, - 0x66, 0x20, 0x74, 0x68, 0x65, 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x64, - 0x6f, 0x65, 0x73, 0x6e, 0x27, 0x74, 0x20, 0x65, 0x6e, 0x64, 0x20, 0x77, - 0x69, 0x74, 0x68, 0x20, 0x61, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, - 0x72, 0x65, 0x61, 0x6b, 0x2c, 0x20, 0x74, 0x68, 0x65, 0x6e, 0x20, 0x74, - 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e, 0x65, - 0x20, 0x69, 0x73, 0x20, 0x69, 0x6e, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x53, 0x74, 0x6f, 0x72, 0x65, 0x20, 0x69, 0x74, 0x20, 0x69, 0x6e, 0x20, - 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x20, - 0x62, 0x65, 0x20, 0x61, 0x64, 0x64, 0x65, 0x64, 0x20, 0x74, 0x6f, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x63, 0x68, 0x75, - 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x65, 0x6e, - 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x4c, 0x69, 0x6e, 0x65, 0x42, 0x72, - 0x65, 0x61, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, - 0x3d, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x2e, 0x70, 0x6f, 0x70, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, - 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, - 0x20, 0x22, 0x22, 0x3b, 0x20, 0x2f, 0x2f, 0x20, 0x52, 0x65, 0x73, 0x65, - 0x74, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x69, - 0x66, 0x20, 0x77, 0x65, 0x20, 0x68, 0x61, 0x76, 0x65, 0x20, 0x61, 0x20, - 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, 0x61, - 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x65, 0x6e, 0x64, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x50, 0x61, 0x72, 0x73, 0x65, 0x20, 0x61, 0x6c, - 0x6c, 0x20, 0x73, 0x73, 0x65, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x73, - 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x64, 0x64, 0x20, 0x74, 0x68, 0x65, - 0x6d, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x72, 0x65, 0x67, 0x65, 0x78, 0x20, 0x3d, 0x20, 0x2f, 0x5e, 0x28, 0x5c, - 0x53, 0x2b, 0x29, 0x3a, 0x5c, 0x73, 0x28, 0x2e, 0x2a, 0x29, 0x24, 0x2f, - 0x67, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x6e, - 0x65, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x73, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20, - 0x72, 0x65, 0x67, 0x65, 0x78, 0x2e, 0x65, 0x78, 0x65, 0x63, 0x28, 0x6c, - 0x69, 0x6e, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x5b, 0x6d, 0x61, 0x74, 0x63, - 0x68, 0x5b, 0x31, 0x5d, 0x5d, 0x20, 0x3d, 0x20, 0x6d, 0x61, 0x74, 0x63, - 0x68, 0x5b, 0x32, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6e, 0x63, 0x65, 0x20, - 0x77, 0x65, 0x20, 0x6b, 0x6e, 0x6f, 0x77, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x20, 0x69, 0x73, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x2c, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x6a, 0x75, 0x73, - 0x74, 0x20, 0x64, 0x65, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x74, 0x68, 0x65, - 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x64, 0x61, 0x74, - 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, - 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, - 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, - 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x79, 0x69, - 0x65, 0x6c, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x69, 0x66, - 0x20, 0x77, 0x65, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x61, 0x20, 0x73, 0x74, - 0x6f, 0x70, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x20, 0x66, 0x72, 0x6f, - 0x6d, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2c, 0x20, 0x77, 0x65, - 0x20, 0x77, 0x69, 0x6c, 0x6c, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, - 0x68, 0x65, 0x72, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, - 0x70, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, - 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, 0x20, - 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, - 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, - 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x66, 0x61, - 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, - 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, - 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, - 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, 0x72, 0x65, 0x73, 0x75, - 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, - 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x2e, 0x69, 0x6e, 0x63, 0x6c, 0x75, 0x64, 0x65, 0x73, 0x28, 0x27, - 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, 0x69, 0x6c, - 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x61, - 0x6e, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x74, 0x6f, 0x20, 0x62, - 0x65, 0x20, 0x63, 0x61, 0x75, 0x67, 0x68, 0x74, 0x20, 0x62, 0x79, 0x20, - 0x75, 0x70, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x20, 0x63, 0x61, 0x6c, - 0x6c, 0x65, 0x72, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x27, 0x73, 0x6c, 0x6f, 0x74, 0x20, 0x75, 0x6e, 0x61, 0x76, 0x61, - 0x69, 0x6c, 0x61, 0x62, 0x6c, 0x65, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x65, 0x72, 0x72, - 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, - 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x5b, 0x24, 0x7b, 0x72, - 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, - 0x63, 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x2d, 0x20, 0x24, 0x7b, 0x72, 0x65, - 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x74, - 0x79, 0x70, 0x65, 0x7d, 0x5d, 0x3a, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x2e, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, - 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x20, - 0x24, 0x7b, 0x72, 0x65, 0x73, 0x75, 0x6c, 0x74, 0x2e, 0x65, 0x72, 0x72, - 0x6f, 0x72, 0x7d, 0x60, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x20, - 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x2e, 0x6e, 0x61, - 0x6d, 0x65, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x41, 0x62, 0x6f, 0x72, - 0x74, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, - 0x2e, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x6c, 0x6c, 0x61, 0x6d, - 0x61, 0x20, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x3a, 0x20, 0x22, 0x2c, 0x20, - 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x65, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, 0x6c, - 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x61, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x20, - 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, - 0x79, 0x6f, 0x75, 0x20, 0x63, 0x61, 0x6e, 0x20, 0x73, 0x75, 0x62, 0x73, - 0x63, 0x72, 0x69, 0x62, 0x65, 0x20, 0x74, 0x6f, 0x0a, 0x2f, 0x2f, 0x0a, - 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, 0x3a, 0x0a, - 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, - 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, - 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7d, - 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x6a, 0x73, 0x27, 0x0a, 0x2f, - 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x20, 0x3d, 0x20, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x6e, 0x2e, 0x61, 0x64, 0x64, - 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, - 0x72, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, 0x2c, - 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, - 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, 0x69, 0x74, 0x65, - 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x65, 0x74, 0x61, 0x69, - 0x6c, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x0a, 0x2f, - 0x2f, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x2f, 0x2f, 0x0a, 0x65, - 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x6d, - 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, - 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x20, - 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x28, 0x61, 0x73, 0x79, 0x6e, 0x63, - 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, - 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, - 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, 0x75, 0x6e, - 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, - 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, - 0x70, 0x61, 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, - 0x65, 0x77, 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, - 0x6e, 0x74, 0x28, 0x22, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x22, - 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, - 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, - 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, - 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, - 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, - 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, - 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, - 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, - 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, - 0x54, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, - 0x74, 0x63, 0x68, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, - 0x20, 0x43, 0x75, 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, - 0x28, 0x22, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x22, 0x2c, 0x20, - 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x3a, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x74, 0x69, 0x6d, - 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x64, 0x69, 0x73, 0x70, 0x61, 0x74, 0x63, 0x68, - 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x6e, 0x65, 0x77, 0x20, 0x43, 0x75, - 0x73, 0x74, 0x6f, 0x6d, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x28, 0x22, 0x64, - 0x6f, 0x6e, 0x65, 0x22, 0x2c, 0x20, 0x7b, 0x20, 0x64, 0x65, 0x74, 0x61, - 0x69, 0x6c, 0x3a, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, - 0x29, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x54, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x3b, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x43, 0x61, 0x6c, - 0x6c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2c, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x61, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x69, 0x73, - 0x65, 0x20, 0x74, 0x68, 0x61, 0x74, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, - 0x76, 0x65, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, - 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x65, 0x64, 0x20, 0x74, 0x65, 0x78, - 0x74, 0x2e, 0x20, 0x54, 0x68, 0x69, 0x73, 0x20, 0x64, 0x6f, 0x65, 0x73, - 0x20, 0x6e, 0x6f, 0x74, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, - 0x20, 0x73, 0x74, 0x72, 0x65, 0x61, 0x6d, 0x69, 0x6e, 0x67, 0x0a, 0x2f, - 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x45, 0x78, 0x61, 0x6d, 0x70, 0x6c, 0x65, - 0x3a, 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, - 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x2e, 0x74, 0x68, 0x65, - 0x6e, 0x28, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, - 0x72, 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, - 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x72, - 0x0a, 0x2f, 0x2f, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x29, 0x0a, 0x2f, 0x2f, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x77, 0x72, - 0x69, 0x74, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, - 0x0a, 0x2f, 0x2f, 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x72, - 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, - 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, - 0x3d, 0x20, 0x7b, 0x7d, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, - 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x28, 0x61, 0x73, 0x79, - 0x6e, 0x63, 0x20, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x2c, - 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, - 0x74, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, - 0x6e, 0x6b, 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x29, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x2b, 0x3d, 0x20, 0x63, 0x68, - 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x6f, - 0x6c, 0x76, 0x65, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x63, 0x61, 0x74, 0x63, - 0x68, 0x20, 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x65, 0x72, 0x72, 0x6f, 0x72, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x7d, 0x3b, 0x0a, - 0x0a, 0x2f, 0x2a, 0x2a, 0x0a, 0x20, 0x2a, 0x20, 0x28, 0x64, 0x65, 0x70, - 0x72, 0x65, 0x63, 0x61, 0x74, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x2a, 0x2f, - 0x0a, 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x43, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, - 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2c, 0x20, 0x63, 0x61, 0x6c, - 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, - 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, - 0x20, 0x6f, 0x66, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x20, 0x7d, 0x29, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x6c, 0x6c, - 0x62, 0x61, 0x63, 0x6b, 0x28, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x47, - 0x65, 0x74, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, - 0x20, 0x69, 0x6e, 0x66, 0x6f, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, - 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x2e, 0x20, 0x54, - 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x75, 0x73, 0x65, 0x66, 0x75, - 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x67, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x20, 0x74, 0x68, 0x65, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, - 0x74, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x20, 0x61, 0x6e, 0x64, - 0x20, 0x73, 0x6f, 0x20, 0x6f, 0x6e, 0x2e, 0x0a, 0x65, 0x78, 0x70, 0x6f, - 0x72, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x49, 0x6e, 0x66, 0x6f, 0x20, - 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, - 0x74, 0x63, 0x68, 0x28, 0x22, 0x2f, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x22, - 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, 0x28, 0x72, 0x20, 0x3d, 0x3e, 0x20, - 0x72, 0x2e, 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, - 0x20, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x5f, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x5f, - 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x3b, 0x0a, 0x7d, 0x0a -}; -size_t completion_js_len = 5796; diff --git a/examples/server/deps.sh b/examples/server/deps.sh index ea23e6450..d28378901 100755 --- a/examples/server/deps.sh +++ b/examples/server/deps.sh @@ -8,13 +8,3 @@ PUBLIC=$DIR/public echo "download js bundle files" curl https://npm.reversehttp.com/@preact/signals-core,@preact/signals,htm/preact,preact,preact/hooks > $PUBLIC/index.js echo >> $PUBLIC/index.js # add newline - -FILES=$(ls $PUBLIC) - -cd $PUBLIC -for FILE in $FILES; do - echo "generate $FILE.hpp" - - # use simple flag for old version of xxd - xxd -i $FILE > $DIR/$FILE.hpp -done diff --git a/examples/server/index.html.hpp b/examples/server/index.html.hpp deleted file mode 100644 index ae8c88980..000000000 --- a/examples/server/index.html.hpp +++ /dev/null @@ -1,2821 +0,0 @@ -unsigned char index_html[] = { - 0x3c, 0x68, 0x74, 0x6d, 0x6c, 0x3e, 0x0a, 0x0a, 0x3c, 0x68, 0x65, 0x61, - 0x64, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x63, - 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x3d, 0x22, 0x55, 0x54, 0x46, 0x2d, - 0x38, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x76, 0x69, 0x65, 0x77, 0x70, 0x6f, - 0x72, 0x74, 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d, - 0x22, 0x77, 0x69, 0x64, 0x74, 0x68, 0x3d, 0x64, 0x65, 0x76, 0x69, 0x63, - 0x65, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, 0x2c, 0x20, 0x69, 0x6e, 0x69, - 0x74, 0x69, 0x61, 0x6c, 0x2d, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x3d, 0x31, - 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2d, 0x73, 0x63, - 0x61, 0x6c, 0x65, 0x3d, 0x31, 0x22, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x3c, 0x6d, 0x65, 0x74, 0x61, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, - 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, - 0x22, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x3d, 0x22, 0x6c, - 0x69, 0x67, 0x68, 0x74, 0x20, 0x64, 0x61, 0x72, 0x6b, 0x22, 0x3e, 0x0a, - 0x20, 0x20, 0x3c, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x20, 0x2d, 0x20, 0x63, 0x68, 0x61, - 0x74, 0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3e, 0x0a, 0x0a, 0x20, - 0x20, 0x3c, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, - 0x79, 0x3a, 0x20, 0x73, 0x79, 0x73, 0x74, 0x65, 0x6d, 0x2d, 0x75, 0x69, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, - 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x39, 0x30, 0x25, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x23, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x65, 0x6d, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, - 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, 0x65, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, 0x75, 0x6d, - 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, 0x75, 0x73, - 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, 0x74, 0x77, - 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, - 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x6d, 0x61, 0x69, 0x6e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x33, 0x70, 0x78, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, - 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, - 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, 0x6f, 0x6c, - 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6a, - 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x3a, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x2d, 0x62, 0x65, - 0x74, 0x77, 0x65, 0x65, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, - 0x72, 0x6f, 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, 0x2d, 0x79, - 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x3a, 0x20, 0x31, - 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, 0x63, - 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, - 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x62, 0x6f, 0x64, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, - 0x3a, 0x20, 0x36, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x2d, 0x77, 0x69, 0x64, 0x74, 0x68, - 0x3a, 0x20, 0x33, 0x30, 0x30, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x68, 0x65, 0x69, 0x67, - 0x68, 0x74, 0x3a, 0x20, 0x31, 0x2e, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x20, - 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x70, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x66, 0x6c, 0x6f, 0x77, - 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, 0x62, 0x72, 0x65, 0x61, 0x6b, - 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x77, 0x6f, 0x72, 0x64, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3a, 0x20, - 0x62, 0x72, 0x65, 0x61, 0x6b, 0x2d, 0x77, 0x6f, 0x72, 0x64, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x79, 0x70, 0x68, 0x65, 0x6e, - 0x73, 0x3a, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x74, 0x6f, - 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x2d, 0x62, - 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x23, 0x77, 0x72, 0x69, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x72, 0x6d, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, - 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x20, 0x30, - 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, - 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, - 0x64, 0x69, 0x72, 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x63, - 0x6f, 0x6c, 0x75, 0x6d, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x69, 0x67, 0x6e, - 0x2d, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x3a, 0x20, 0x73, 0x74, 0x72, 0x65, - 0x74, 0x63, 0x68, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x69, 0x67, 0x68, 0x74, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x64, 0x69, 0x72, - 0x65, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x72, 0x6f, 0x77, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, - 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6a, 0x75, 0x73, 0x74, 0x69, 0x66, 0x79, 0x2d, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x65, - 0x6e, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, - 0x65, 0x72, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, - 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, - 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x2e, 0x74, 0x77, 0x6f, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, - 0x3a, 0x20, 0x67, 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x67, 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, - 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x2e, - 0x74, 0x68, 0x72, 0x65, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x67, - 0x72, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, - 0x72, 0x69, 0x64, 0x2d, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x3a, 0x20, 0x22, 0x61, 0x20, 0x61, 0x20, 0x61, 0x22, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x61, 0x70, 0x3a, 0x20, 0x31, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, - 0x3a, 0x20, 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, - 0x23, 0x61, 0x61, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x62, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, - 0x73, 0x3a, 0x20, 0x34, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, - 0x69, 0x6e, 0x2d, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x35, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x77, - 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x62, 0x6f, 0x6c, 0x64, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, - 0x6e, 0x3a, 0x20, 0x2d, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x2d, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x75, 0x72, 0x73, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x6f, 0x69, 0x6e, - 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x5b, - 0x6f, 0x70, 0x65, 0x6e, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, - 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x2d, 0x73, - 0x65, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, 0x33, 0x65, - 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x2d, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x3a, 0x20, - 0x31, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, 0x23, 0x63, - 0x63, 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, - 0x3a, 0x20, 0x61, 0x62, 0x73, 0x6f, 0x6c, 0x75, 0x74, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, - 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, - 0x77, 0x68, 0x69, 0x74, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x30, 0x2e, - 0x32, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, - 0x6f, 0x78, 0x2d, 0x73, 0x68, 0x61, 0x64, 0x6f, 0x77, 0x3a, 0x20, 0x30, - 0x20, 0x30, 0x20, 0x31, 0x30, 0x70, 0x78, 0x20, 0x72, 0x67, 0x62, 0x61, - 0x28, 0x30, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x30, 0x2c, 0x20, 0x30, 0x2e, - 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, - 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x35, 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6c, 0x65, 0x78, 0x2d, 0x67, 0x72, 0x6f, - 0x77, 0x3a, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x70, 0x72, 0x65, 0x20, 0x63, 0x6f, 0x64, 0x65, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, - 0x3a, 0x20, 0x62, 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, - 0x64, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x32, 0x32, - 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6c, - 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x64, 0x64, 0x64, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x64, - 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, - 0x6e, 0x74, 0x2d, 0x66, 0x61, 0x6d, 0x69, 0x6c, 0x79, 0x3a, 0x20, 0x6d, - 0x6f, 0x6e, 0x6f, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, - 0x20, 0x30, 0x2e, 0x31, 0x65, 0x6d, 0x20, 0x30, 0x2e, 0x33, 0x65, 0x6d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x6f, 0x72, 0x64, - 0x65, 0x72, 0x2d, 0x72, 0x61, 0x64, 0x69, 0x75, 0x73, 0x3a, 0x20, 0x33, - 0x70, 0x78, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, - 0x35, 0x65, 0x6d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x62, 0x6c, - 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2e, 0x73, 0x6c, 0x69, 0x6d, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x72, 0x67, - 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x20, 0x30, 0x2e, 0x35, 0x65, 0x6d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x68, - 0x65, 0x61, 0x64, 0x65, 0x72, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x65, 0x78, 0x74, 0x2d, 0x61, 0x6c, 0x69, 0x67, 0x6e, - 0x3a, 0x20, 0x63, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x6f, - 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x6f, 0x6e, 0x74, 0x2d, 0x73, 0x69, 0x7a, 0x65, 0x3a, 0x20, 0x38, - 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x23, 0x38, 0x38, 0x38, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d, - 0x6f, 0x64, 0x65, 0x2d, 0x63, 0x68, 0x61, 0x74, 0x20, 0x74, 0x65, 0x78, - 0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x70, - 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, 0x34, - 0x2e, 0x35, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x63, - 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x65, - 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x3d, - 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x68, 0x65, 0x69, 0x67, 0x68, 0x74, 0x3a, 0x20, - 0x31, 0x30, 0x65, 0x6d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x5b, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x65, 0x64, 0x69, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x5d, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x69, 0x6e, 0x6c, 0x69, 0x6e, 0x65, 0x2d, 0x62, - 0x6c, 0x6f, 0x63, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x77, 0x68, 0x69, 0x74, 0x65, 0x2d, 0x73, 0x70, 0x61, 0x63, 0x65, 0x3a, - 0x20, 0x70, 0x72, 0x65, 0x2d, 0x77, 0x72, 0x61, 0x70, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6f, 0x75, 0x74, 0x6c, 0x69, 0x6e, 0x65, - 0x3a, 0x20, 0x30, 0x70, 0x78, 0x20, 0x73, 0x6f, 0x6c, 0x69, 0x64, 0x20, - 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x40, 0x6b, 0x65, 0x79, 0x66, 0x72, 0x61, 0x6d, 0x65, 0x73, 0x20, 0x6c, - 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69, - 0x70, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x30, - 0x25, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70, - 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x30, 0x25, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x31, 0x30, 0x30, 0x25, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, - 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, - 0x6f, 0x6e, 0x3a, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, - 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x31, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, - 0x30, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, - 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x65, 0x65, 0x65, 0x65, 0x65, 0x65, - 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, - 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x73, 0x69, 0x7a, - 0x65, 0x3a, 0x20, 0x35, 0x30, 0x25, 0x20, 0x31, 0x30, 0x30, 0x25, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, - 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x3a, - 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x2d, 0x67, 0x72, 0x61, 0x64, - 0x69, 0x65, 0x6e, 0x74, 0x28, 0x39, 0x30, 0x64, 0x65, 0x67, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x32, 0x29, 0x2c, 0x20, - 0x76, 0x61, 0x72, 0x28, 0x2d, 0x2d, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, 0x31, 0x29, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6e, 0x69, 0x6d, 0x61, - 0x74, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, - 0x67, 0x2d, 0x62, 0x67, 0x2d, 0x77, 0x69, 0x70, 0x65, 0x20, 0x32, 0x73, - 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x20, 0x69, 0x6e, 0x66, 0x69, - 0x6e, 0x69, 0x74, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x40, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x20, - 0x28, 0x70, 0x72, 0x65, 0x66, 0x65, 0x72, 0x73, 0x2d, 0x63, 0x6f, 0x6c, - 0x6f, 0x72, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x65, 0x3a, 0x20, 0x64, - 0x61, 0x72, 0x6b, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2e, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, 0x6c, 0x6f, - 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, 0x72, 0x2d, - 0x31, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, 0x30, 0x30, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2d, 0x2d, - 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x2d, 0x63, 0x6f, 0x6c, 0x6f, - 0x72, 0x2d, 0x32, 0x3a, 0x20, 0x23, 0x32, 0x32, 0x32, 0x32, 0x32, 0x32, - 0x66, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x70, 0x6f, 0x70, 0x6f, - 0x76, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, - 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, 0x6e, 0x64, 0x2d, 0x63, 0x6f, 0x6c, - 0x6f, 0x72, 0x3a, 0x20, 0x62, 0x6c, 0x61, 0x63, 0x6b, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3e, 0x0a, - 0x0a, 0x20, 0x20, 0x3c, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x75, 0x6c, 0x65, 0x22, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x2c, 0x20, 0x68, 0x2c, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, - 0x2c, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x20, 0x63, 0x6f, - 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x6e, 0x64, - 0x65, 0x72, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, - 0x6c, 0x2c, 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, - 0x2c, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x2c, 0x20, 0x43, 0x6f, - 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x69, 0x6e, 0x64, - 0x65, 0x78, 0x2e, 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x20, 0x7d, 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, - 0x2f, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x2e, - 0x6a, 0x73, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x70, - 0x6f, 0x72, 0x74, 0x20, 0x7b, 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x20, 0x7d, 0x20, - 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x27, 0x2f, 0x6a, 0x73, 0x6f, 0x6e, 0x2d, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2d, 0x74, 0x6f, 0x2d, 0x67, 0x72, - 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x2e, 0x6d, 0x6a, 0x73, 0x27, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65, - 0x63, 0x74, 0x65, 0x64, 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d, - 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x76, 0x61, 0x72, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x20, - 0x3d, 0x20, 0x2d, 0x31, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x3a, 0x20, 0x22, 0x54, 0x68, 0x69, 0x73, 0x20, 0x69, 0x73, 0x20, 0x61, - 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x73, 0x61, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x62, 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x55, 0x73, - 0x65, 0x72, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x4c, 0x6c, 0x61, 0x6d, 0x61, - 0x2c, 0x20, 0x61, 0x20, 0x66, 0x72, 0x69, 0x65, 0x6e, 0x64, 0x6c, 0x79, - 0x20, 0x63, 0x68, 0x61, 0x74, 0x62, 0x6f, 0x74, 0x2e, 0x20, 0x4c, 0x6c, - 0x61, 0x6d, 0x61, 0x20, 0x69, 0x73, 0x20, 0x68, 0x65, 0x6c, 0x70, 0x66, - 0x75, 0x6c, 0x2c, 0x20, 0x6b, 0x69, 0x6e, 0x64, 0x2c, 0x20, 0x68, 0x6f, - 0x6e, 0x65, 0x73, 0x74, 0x2c, 0x20, 0x67, 0x6f, 0x6f, 0x64, 0x20, 0x61, - 0x74, 0x20, 0x77, 0x72, 0x69, 0x74, 0x69, 0x6e, 0x67, 0x2c, 0x20, 0x61, - 0x6e, 0x64, 0x20, 0x6e, 0x65, 0x76, 0x65, 0x72, 0x20, 0x66, 0x61, 0x69, - 0x6c, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, 0x72, - 0x20, 0x61, 0x6e, 0x79, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, - 0x73, 0x20, 0x69, 0x6d, 0x6d, 0x65, 0x64, 0x69, 0x61, 0x74, 0x65, 0x6c, - 0x79, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x70, - 0x72, 0x65, 0x63, 0x69, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x22, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x7d, 0x7d, 0x5c, 0x6e, 0x5c, 0x6e, 0x7b, 0x7b, 0x68, 0x69, 0x73, - 0x74, 0x6f, 0x72, 0x79, 0x7d, 0x7d, 0x5c, 0x6e, 0x7b, 0x7b, 0x63, 0x68, - 0x61, 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x22, 0x7b, 0x7b, 0x6e, 0x61, - 0x6d, 0x65, 0x7d, 0x7d, 0x3a, 0x20, 0x7b, 0x7b, 0x6d, 0x65, 0x73, 0x73, - 0x61, 0x67, 0x65, 0x7d, 0x7d, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, - 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x79, 0x70, 0x65, 0x3a, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, - 0x2c, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, - 0x20, 0x7c, 0x20, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, - 0x61, 0x72, 0x3a, 0x20, 0x22, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x22, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x3a, - 0x20, 0x22, 0x55, 0x73, 0x65, 0x72, 0x22, 0x2c, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x6c, - 0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x27, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, - 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, - 0x74, 0x3a, 0x20, 0x34, 0x30, 0x30, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, - 0x65, 0x3a, 0x20, 0x30, 0x2e, 0x37, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, - 0x74, 0x5f, 0x6e, 0x3a, 0x20, 0x32, 0x35, 0x36, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, - 0x20, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x2c, 0x20, 0x2d, 0x31, - 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x20, 0x73, - 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x3a, 0x20, 0x31, 0x2e, 0x31, 0x38, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, - 0x2e, 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, - 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, - 0x6b, 0x3a, 0x20, 0x34, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x3c, 0x3d, - 0x20, 0x30, 0x20, 0x74, 0x6f, 0x20, 0x75, 0x73, 0x65, 0x20, 0x76, 0x6f, - 0x63, 0x61, 0x62, 0x20, 0x73, 0x69, 0x7a, 0x65, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x39, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, - 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x5f, 0x70, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x20, 0x3d, 0x20, - 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x3a, 0x20, 0x31, 0x2e, - 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, 0x30, 0x20, 0x3d, 0x20, - 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, - 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x31, 0x2e, - 0x30, 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x65, 0x73, 0x65, - 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, - 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, - 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2e, 0x30, - 0x20, 0x3d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x30, 0x2f, - 0x31, 0x2f, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, - 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, 0x3a, 0x20, - 0x35, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x20, 0x65, 0x6e, 0x74, 0x72, 0x6f, 0x70, 0x79, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, - 0x65, 0x74, 0x61, 0x3a, 0x20, 0x30, 0x2e, 0x31, 0x2c, 0x20, 0x2f, 0x2f, - 0x20, 0x6c, 0x65, 0x61, 0x72, 0x6e, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x61, - 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, - 0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, 0x27, 0x27, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x3a, - 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x63, 0x6f, - 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, - 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, 0x5f, 0x6b, 0x65, - 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x2f, 0x2f, 0x20, 0x6d, 0x69, - 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x20, 0x66, 0x72, 0x6f, 0x6d, - 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x65, - 0x72, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x6d, 0x61, - 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x61, 0x63, 0x68, 0x65, - 0x5f, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3a, 0x20, 0x74, 0x72, 0x75, - 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x70, 0x69, - 0x5f, 0x6b, 0x65, 0x79, 0x3a, 0x20, 0x27, 0x27, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x20, - 0x53, 0x54, 0x41, 0x52, 0x54, 0x3a, 0x20, 0x53, 0x75, 0x70, 0x70, 0x6f, - 0x72, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x69, - 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, 0x20, 0x69, - 0x6e, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x73, 0x65, 0x72, 0x73, 0x20, 0x4c, - 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, - 0x2a, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, - 0x65, 0x79, 0x20, 0x3d, 0x20, 0x22, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x63, - 0x70, 0x70, 0x5f, 0x73, 0x65, 0x72, 0x76, 0x65, 0x72, 0x5f, 0x6c, 0x6f, - 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x22, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, - 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, - 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, - 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, - 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x73, - 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, - 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, - 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x4a, 0x53, 0x4f, - 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, - 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x52, 0x61, 0x77, 0x54, - 0x65, 0x78, 0x74, 0x28, 0x74, 0x61, 0x67, 0x2c, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, - 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, - 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, 0x2c, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, - 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, - 0x28, 0x74, 0x61, 0x67, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, - 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x2e, 0x67, 0x65, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x28, - 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x4b, 0x65, 0x79, - 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, 0x2b, 0x20, 0x74, 0x61, 0x67, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x21, 0x69, 0x74, 0x65, 0x6d, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, 0x73, 0x65, 0x28, - 0x69, 0x74, 0x65, 0x6d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6c, - 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, - 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, 0x52, 0x61, - 0x77, 0x54, 0x65, 0x78, 0x74, 0x28, 0x74, 0x61, 0x67, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x74, 0x65, 0x6d, 0x20, 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, - 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x2e, 0x67, 0x65, 0x74, - 0x49, 0x74, 0x65, 0x6d, 0x28, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x4b, 0x65, 0x79, 0x20, 0x2b, 0x20, 0x27, 0x2f, 0x27, 0x20, - 0x2b, 0x20, 0x74, 0x61, 0x67, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x69, 0x74, 0x65, 0x6d, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x72, - 0x65, 0x61, 0x74, 0x65, 0x20, 0x61, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, - 0x69, 0x6e, 0x65, 0x72, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x75, 0x73, 0x65, - 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, - 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, - 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, - 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x7b, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x27, 0x27, 0x2c, 0x20, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3a, 0x20, 0x7b, 0x20, - 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x7b, 0x7d, 0x2c, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x3a, 0x20, 0x7b, 0x7d, 0x20, - 0x7d, 0x20, 0x7d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, - 0x74, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x6c, 0x79, 0x20, 0x73, 0x61, - 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, - 0x67, 0x73, 0x20, 0x69, 0x66, 0x20, 0x74, 0x68, 0x65, 0x72, 0x65, 0x20, - 0x61, 0x72, 0x65, 0x20, 0x61, 0x6e, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x75, 0x73, 0x65, 0x72, 0x20, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, 0x65, - 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x61, 0x72, 0x65, 0x20, 0x73, - 0x74, 0x6f, 0x72, 0x65, 0x64, 0x20, 0x69, 0x6e, 0x20, 0x6f, 0x6e, 0x65, - 0x20, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x69, 0x6e, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6f, - 0x66, 0x20, 0x7b, 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x20, 0x22, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x64, 0x61, 0x74, 0x61, 0x22, 0x20, 0x7d, - 0x20, 0x61, 0x6e, 0x64, 0x20, 0x7b, 0x20, 0x22, 0x73, 0x65, 0x74, 0x74, - 0x69, 0x6e, 0x67, 0x73, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x6e, 0x61, 0x6d, 0x65, 0x22, 0x3a, 0x22, 0x73, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x64, 0x61, 0x74, 0x61, 0x22, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, - 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x49, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x69, - 0x6e, 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, - 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, - 0x3d, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, - 0x73, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, - 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, - 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x77, 0x65, 0x72, - 0x65, 0x20, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x66, 0x75, 0x6c, - 0x6c, 0x79, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x2e, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x50, 0x72, 0x6f, - 0x63, 0x65, 0x73, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x73, 0x61, 0x76, 0x65, - 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, - 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x69, 0x6e, 0x67, - 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, - 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, - 0x5b, 0x5d, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, - 0x6f, 0x67, 0x28, 0x69, 0x6d, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x69, 0x6d, 0x70, 0x6f, - 0x72, 0x74, 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x6f, 0x76, 0x65, 0x72, 0x72, 0x69, 0x64, 0x65, 0x20, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x61, 0x76, 0x65, - 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, - 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, - 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, - 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x73, 0x27, 0x2c, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, - 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, - 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, - 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x64, - 0x65, 0x74, 0x65, 0x63, 0x74, 0x65, 0x64, 0x2e, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, - 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x49, 0x6e, 0x69, 0x74, 0x69, 0x61, 0x6c, - 0x69, 0x7a, 0x69, 0x6e, 0x67, 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x53, - 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x73, - 0x61, 0x76, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, - 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x61, 0x76, 0x65, - 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, - 0x20, 0x22, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x22, 0x3a, 0x20, - 0x7b, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3a, 0x20, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2c, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x3a, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, - 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, - 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x27, 0x2c, 0x20, 0x73, - 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, 0x65, - 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, - 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x52, - 0x65, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, - 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x27, - 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, - 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x20, - 0x3d, 0x20, 0x73, 0x61, 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x5b, 0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, - 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, - 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, - 0x70, 0x70, 0x6c, 0x79, 0x28, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, - 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, - 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x27, - 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, - 0x20, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, - 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, - 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, 0x5d, 0x20, 0x7d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, - 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, - 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x28, - 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x4c, 0x6f, 0x61, 0x64, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, - 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x28, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x67, - 0x65, 0x74, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, - 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, - 0x65, 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x3d, - 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, - 0x67, 0x65, 0x5f, 0x67, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x41, 0x73, - 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, - 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x5f, 0x6c, - 0x61, 0x73, 0x74, 0x27, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, - 0x64, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x41, - 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, - 0x20, 0x72, 0x65, 0x73, 0x74, 0x6f, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, - 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x20, 0x3d, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x55, 0x73, 0x65, 0x64, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, - 0x67, 0x28, 0x27, 0x4e, 0x6f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, - 0x76, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x2c, 0x20, 0x75, 0x73, 0x69, 0x6e, - 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x6e, 0x6f, 0x20, 0x61, - 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, - 0x74, 0x20, 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x20, 0x77, 0x61, 0x73, 0x20, 0x66, 0x6f, 0x75, 0x6e, - 0x64, 0x2c, 0x20, 0x73, 0x6f, 0x20, 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x66, - 0x72, 0x6f, 0x6d, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x2e, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, - 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, - 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x41, 0x70, 0x70, 0x6c, 0x79, - 0x69, 0x6e, 0x67, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x61, 0x6e, 0x64, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x20, 0x69, - 0x6e, 0x74, 0x65, 0x72, 0x6e, 0x61, 0x6c, 0x20, 0x64, 0x61, 0x74, 0x61, - 0x20, 0x66, 0x72, 0x6f, 0x6d, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x73, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, - 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x41, - 0x70, 0x70, 0x6c, 0x79, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, - 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x73, 0x61, - 0x76, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, - 0x61, 0x74, 0x65, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, - 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, - 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x28, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, - 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x54, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x20, 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, - 0x65, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, - 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, - 0x20, 0x3d, 0x3d, 0x20, 0x27, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x77, 0x65, 0x20, 0x64, 0x6f, 0x6e, 0x27, 0x74, - 0x20, 0x77, 0x61, 0x6e, 0x74, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x61, 0x76, - 0x65, 0x20, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, - 0x20, 0x73, 0x6f, 0x20, 0x6c, 0x65, 0x74, 0x27, 0x73, 0x20, 0x63, 0x72, - 0x65, 0x61, 0x74, 0x65, 0x20, 0x61, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x6f, - 0x6e, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x27, 0x55, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2d, 0x27, - 0x20, 0x2b, 0x20, 0x44, 0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, - 0x29, 0x2e, 0x74, 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x28, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x27, 0x6e, 0x61, 0x6d, 0x65, 0x27, 0x3a, - 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x27, 0x64, 0x61, 0x74, 0x61, 0x27, - 0x3a, 0x20, 0x7b, 0x20, 0x27, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x27, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x27, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x27, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, - 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x53, 0x61, 0x76, 0x69, 0x6e, - 0x67, 0x20, 0x61, 0x73, 0x20, 0x27, 0x20, 0x2b, 0x20, 0x6e, 0x65, 0x77, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4e, 0x61, 0x6d, 0x65, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, - 0x2f, 0x20, 0x73, 0x61, 0x76, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, - 0x65, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x20, 0x73, - 0x6c, 0x6f, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, - 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, - 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x5f, - 0x6c, 0x61, 0x73, 0x74, 0x27, 0x2c, 0x20, 0x6e, 0x65, 0x77, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x6e, 0x64, 0x20, - 0x6c, 0x6f, 0x61, 0x64, 0x20, 0x69, 0x74, 0x20, 0x62, 0x61, 0x63, 0x6b, - 0x20, 0x61, 0x6e, 0x64, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x79, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, 0x41, - 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, 0x73, - 0x61, 0x76, 0x65, 0x64, 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x5f, - 0x73, 0x74, 0x6f, 0x72, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, 0x74, 0x44, - 0x61, 0x74, 0x61, 0x46, 0x72, 0x6f, 0x6d, 0x4f, 0x62, 0x6a, 0x65, 0x63, - 0x74, 0x28, 0x27, 0x75, 0x73, 0x65, 0x72, 0x5f, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x73, 0x5f, 0x6c, 0x61, 0x73, 0x74, 0x27, 0x2c, - 0x20, 0x7b, 0x20, 0x27, 0x6e, 0x61, 0x6d, 0x65, 0x27, 0x3a, 0x20, 0x73, - 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x27, 0x64, 0x61, 0x74, - 0x61, 0x27, 0x3a, 0x20, 0x7b, 0x20, 0x27, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x27, 0x3a, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x27, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x27, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x20, 0x7d, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, - 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x27, 0x43, 0x68, 0x65, 0x63, - 0x6b, 0x69, 0x6e, 0x67, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x61, 0x75, 0x74, - 0x6f, 0x73, 0x61, 0x76, 0x65, 0x64, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, - 0x75, 0x73, 0x65, 0x64, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x4c, 0x6f, 0x61, 0x64, - 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, 0x41, 0x75, 0x74, 0x6f, - 0x73, 0x61, 0x76, 0x65, 0x64, 0x28, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2a, 0x20, 0x45, 0x4e, 0x44, 0x3a, 0x20, 0x53, 0x75, 0x70, - 0x70, 0x6f, 0x72, 0x74, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x73, 0x74, 0x6f, - 0x72, 0x69, 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x73, 0x20, 0x61, 0x6e, - 0x64, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x65, 0x74, 0x65, 0x72, 0x73, - 0x20, 0x69, 0x6e, 0x20, 0x62, 0x72, 0x6f, 0x77, 0x73, 0x65, 0x72, 0x73, - 0x20, 0x4c, 0x6f, 0x63, 0x61, 0x6c, 0x53, 0x74, 0x6f, 0x72, 0x61, 0x67, - 0x65, 0x20, 0x2a, 0x2f, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, - 0x74, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, - 0x65, 0x72, 0x20, 0x3d, 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, - 0x2f, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x6c, 0x79, 0x20, - 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x61, - 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x3f, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, - 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x20, 0x3d, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, - 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x21, 0x3d, 0x20, 0x6e, - 0x75, 0x6c, 0x6c, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x68, 0x61, 0x73, 0x20, 0x74, 0x68, 0x65, 0x20, 0x75, 0x73, 0x65, - 0x72, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x65, 0x64, 0x20, 0x61, 0x20, - 0x63, 0x68, 0x61, 0x74, 0x3f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, 0x61, 0x72, - 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x75, 0x74, - 0x65, 0x64, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x72, 0x61, - 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x20, 0x3d, 0x20, 0x28, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, - 0x69, 0x70, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, - 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, - 0x3d, 0x20, 0x28, 0x73, 0x74, 0x72, 0x2c, 0x20, 0x65, 0x78, 0x74, 0x72, - 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, - 0x74, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x3d, - 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x65, 0x78, 0x74, 0x72, 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, - 0x3d, 0x20, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x74, 0x74, 0x69, - 0x6e, 0x67, 0x73, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x65, 0x78, 0x74, 0x72, - 0x61, 0x53, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x20, 0x7d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x53, 0x74, - 0x72, 0x69, 0x6e, 0x67, 0x28, 0x73, 0x74, 0x72, 0x29, 0x2e, 0x72, 0x65, - 0x70, 0x6c, 0x61, 0x63, 0x65, 0x41, 0x6c, 0x6c, 0x28, 0x2f, 0x5c, 0x7b, - 0x5c, 0x7b, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5c, 0x7d, 0x5c, 0x7d, 0x2f, - 0x67, 0x2c, 0x20, 0x28, 0x5f, 0x2c, 0x20, 0x6b, 0x65, 0x79, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, - 0x73, 0x65, 0x74, 0x74, 0x69, 0x6e, 0x67, 0x73, 0x5b, 0x6b, 0x65, 0x79, - 0x5d, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x75, 0x6e, 0x4c, 0x6c, - 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2c, - 0x20, 0x63, 0x68, 0x61, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x75, 0x72, - 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, - 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, - 0x72, 0x79, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, - 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x22, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, - 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x22, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x41, 0x62, - 0x6f, 0x72, 0x74, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, - 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x28, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x20, 0x6f, 0x66, - 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x2c, 0x20, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x50, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2c, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, - 0x6c, 0x6c, 0x65, 0x72, 0x3a, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, - 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, - 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x64, 0x61, 0x74, 0x61, 0x20, - 0x3d, 0x20, 0x63, 0x68, 0x75, 0x6e, 0x6b, 0x2e, 0x64, 0x61, 0x74, 0x61, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x74, 0x6f, 0x70, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, - 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, - 0x20, 0x26, 0x26, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5b, 0x63, 0x75, 0x72, 0x72, - 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x2e, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, 0x63, - 0x68, 0x28, 0x2f, 0x5c, 0x6e, 0x24, 0x2f, 0x29, 0x20, 0x21, 0x3d, 0x20, - 0x6e, 0x75, 0x6c, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, - 0x6f, 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, - 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, - 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x63, - 0x68, 0x61, 0x72, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, - 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, 0x28, 0x22, 0x43, - 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x69, - 0x6e, 0x69, 0x73, 0x68, 0x65, 0x64, 0x3a, 0x20, 0x27, 0x22, 0x2c, 0x20, - 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, - 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, - 0x2c, 0x20, 0x22, 0x27, 0x2c, 0x20, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, - 0x79, 0x3a, 0x20, 0x22, 0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, - 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x64, 0x61, 0x74, 0x61, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, - 0x64, 0x20, 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x73, 0x6c, 0x6f, - 0x74, 0x5f, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, - 0x63, 0x74, 0x65, 0x64, 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x26, - 0x26, 0x20, 0x21, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x75, 0x6c, 0x74, - 0x69, 0x6d, 0x6f, 0x64, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, - 0x65, 0x72, 0x74, 0x28, 0x22, 0x54, 0x68, 0x65, 0x20, 0x73, 0x65, 0x72, - 0x76, 0x65, 0x72, 0x20, 0x77, 0x61, 0x73, 0x20, 0x6e, 0x6f, 0x74, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x69, 0x6c, 0x65, 0x64, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x6d, 0x75, 0x6c, 0x74, 0x69, 0x6d, 0x6f, 0x64, 0x61, 0x6c, 0x20, - 0x6f, 0x72, 0x20, 0x74, 0x68, 0x65, 0x20, 0x6d, 0x6f, 0x64, 0x65, 0x6c, - 0x20, 0x70, 0x72, 0x6f, 0x6a, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x20, 0x63, - 0x61, 0x6e, 0x27, 0x74, 0x20, 0x62, 0x65, 0x20, 0x6c, 0x6f, 0x61, 0x64, - 0x65, 0x64, 0x2e, 0x22, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, - 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x68, 0x69, - 0x73, 0x74, 0x6f, 0x72, 0x79, 0x2c, 0x20, 0x5b, 0x63, 0x68, 0x61, 0x72, - 0x2c, 0x20, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x4d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x64, 0x61, 0x74, 0x61, - 0x2e, 0x74, 0x69, 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x6c, - 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x3d, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x2f, 0x2f, 0x20, 0x73, 0x65, 0x6e, 0x64, 0x20, 0x6d, 0x65, 0x73, 0x73, - 0x61, 0x67, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x73, 0x65, 0x72, 0x76, 0x65, - 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x63, 0x68, 0x61, 0x74, 0x20, 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, - 0x20, 0x28, 0x6d, 0x73, 0x67, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, - 0x67, 0x28, 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, - 0x75, 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, 0x61, 0x6e, - 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, - 0x63, 0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x7b, 0x7b, 0x75, - 0x73, 0x65, 0x72, 0x7d, 0x7d, 0x22, 0x2c, 0x20, 0x6d, 0x73, 0x67, 0x5d, - 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, - 0x74, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x3a, - 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, - 0x74, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, 0x70, 0x28, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x5b, 0x6e, - 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x64, 0x61, 0x74, 0x61, 0x5d, 0x29, 0x20, - 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, - 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x3a, - 0x20, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, - 0x61, 0x79, 0x28, 0x64, 0x61, 0x74, 0x61, 0x29, 0x20, 0x3f, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, - 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, - 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, - 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, 0x2f, 0x2c, 0x20, 0x27, 0x27, - 0x29, 0x20, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x61, - 0x74, 0x61, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, - 0x28, 0x22, 0x5c, 0x6e, 0x22, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, - 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, - 0x20, 0x3d, 0x20, 0x60, 0x41, 0x20, 0x63, 0x68, 0x61, 0x74, 0x20, 0x62, - 0x65, 0x74, 0x77, 0x65, 0x65, 0x6e, 0x20, 0x61, 0x20, 0x63, 0x75, 0x72, - 0x69, 0x6f, 0x75, 0x73, 0x20, 0x68, 0x75, 0x6d, 0x61, 0x6e, 0x20, 0x61, - 0x6e, 0x64, 0x20, 0x61, 0x6e, 0x20, 0x61, 0x72, 0x74, 0x69, 0x66, 0x69, - 0x63, 0x69, 0x61, 0x6c, 0x20, 0x69, 0x6e, 0x74, 0x65, 0x6c, 0x6c, 0x69, - 0x67, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x61, 0x73, 0x73, 0x69, 0x73, 0x74, - 0x61, 0x6e, 0x74, 0x2e, 0x20, 0x54, 0x68, 0x65, 0x20, 0x61, 0x73, 0x73, - 0x69, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x20, 0x67, 0x69, 0x76, 0x65, 0x73, - 0x20, 0x68, 0x65, 0x6c, 0x70, 0x66, 0x75, 0x6c, 0x2c, 0x20, 0x64, 0x65, - 0x74, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x20, - 0x70, 0x6f, 0x6c, 0x69, 0x74, 0x65, 0x20, 0x61, 0x6e, 0x73, 0x77, 0x65, - 0x72, 0x73, 0x20, 0x74, 0x6f, 0x20, 0x74, 0x68, 0x65, 0x20, 0x68, 0x75, - 0x6d, 0x61, 0x6e, 0x27, 0x73, 0x20, 0x71, 0x75, 0x65, 0x73, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x2e, 0x5c, 0x6e, 0x55, 0x53, 0x45, 0x52, 0x3a, 0x5b, - 0x69, 0x6d, 0x67, 0x2d, 0x31, 0x30, 0x5d, 0x24, 0x7b, 0x6d, 0x73, 0x67, - 0x7d, 0x5c, 0x6e, 0x41, 0x53, 0x53, 0x49, 0x53, 0x54, 0x41, 0x4e, 0x54, - 0x3a, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, - 0x72, 0x75, 0x6e, 0x4c, 0x6c, 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, - 0x6d, 0x70, 0x74, 0x2c, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x3a, - 0x20, 0x73, 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, - 0x5b, 0x22, 0x3c, 0x2f, 0x73, 0x3e, 0x22, 0x2c, 0x20, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, - 0x72, 0x7d, 0x7d, 0x3a, 0x22, 0x29, 0x2c, 0x20, 0x74, 0x65, 0x6d, 0x70, - 0x6c, 0x61, 0x74, 0x65, 0x28, 0x22, 0x7b, 0x7b, 0x75, 0x73, 0x65, 0x72, - 0x7d, 0x7d, 0x3a, 0x22, 0x29, 0x5d, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x22, 0x7b, 0x7b, 0x63, 0x68, 0x61, 0x72, - 0x7d, 0x7d, 0x22, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, - 0x75, 0x6e, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x6f, 0x6c, 0x65, 0x2e, 0x6c, 0x6f, 0x67, - 0x28, 0x27, 0x61, 0x6c, 0x72, 0x65, 0x61, 0x64, 0x79, 0x20, 0x72, 0x75, - 0x6e, 0x6e, 0x69, 0x6e, 0x67, 0x2e, 0x2e, 0x2e, 0x27, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x7b, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x7d, 0x20, 0x3d, - 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x72, - 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x28, 0x5b, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, - 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, - 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2c, 0x20, 0x5b, 0x22, 0x22, - 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x5d, 0x5d, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6e, 0x4c, 0x6c, - 0x61, 0x6d, 0x61, 0x28, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x2c, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, - 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, - 0x6c, 0x6f, 0x74, 0x5f, 0x69, 0x64, 0x3a, 0x20, 0x73, 0x6c, 0x6f, 0x74, - 0x5f, 0x69, 0x64, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x73, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x5b, 0x5d, 0x2c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x22, 0x22, 0x29, 0x2e, - 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x28, 0x28, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x2e, - 0x6d, 0x61, 0x70, 0x28, 0x28, 0x5b, 0x5f, 0x2c, 0x20, 0x64, 0x61, 0x74, - 0x61, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, - 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x64, 0x61, 0x74, 0x61, 0x29, - 0x20, 0x3f, 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, - 0x6d, 0x73, 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, - 0x28, 0x27, 0x27, 0x29, 0x20, 0x3a, 0x20, 0x64, 0x61, 0x74, 0x61, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2e, 0x6a, 0x6f, - 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, - 0x72, 0x69, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x73, 0x74, 0x6f, 0x70, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, - 0x2e, 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x61, 0x62, 0x6f, 0x72, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x72, - 0x6f, 0x6c, 0x6c, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x73, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, - 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x55, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x28, 0x5b, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x75, 0x70, 0x6c, 0x6f, 0x61, 0x64, 0x49, 0x6d, 0x61, 0x67, - 0x65, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, 0x70, 0x72, 0x65, - 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, 0x63, - 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x45, 0x6c, 0x65, - 0x6d, 0x65, 0x6e, 0x74, 0x42, 0x79, 0x49, 0x64, 0x28, 0x22, 0x66, 0x69, - 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0x29, 0x2e, 0x63, 0x6c, - 0x69, 0x63, 0x6b, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, - 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x42, 0x79, 0x49, 0x64, - 0x28, 0x22, 0x66, 0x69, 0x6c, 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, - 0x29, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, - 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x22, 0x63, 0x68, 0x61, 0x6e, - 0x67, 0x65, 0x22, 0x2c, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x46, 0x69, - 0x6c, 0x65, 0x20, 0x3d, 0x20, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x66, 0x69, 0x6c, 0x65, 0x73, 0x5b, - 0x30, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, - 0x46, 0x69, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x46, 0x69, 0x6c, 0x65, 0x52, 0x65, 0x61, 0x64, 0x65, 0x72, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x6f, 0x6e, 0x6c, 0x6f, - 0x61, 0x64, 0x20, 0x3d, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x20, - 0x3d, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x73, - 0x75, 0x6c, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x2e, - 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, - 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x3a, 0x20, 0x69, 0x6d, 0x61, - 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x20, 0x7d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x69, - 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x20, 0x5b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7b, 0x20, 0x64, 0x61, 0x74, 0x61, 0x3a, - 0x20, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x64, 0x61, 0x74, 0x61, 0x2e, - 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x64, 0x61, 0x74, - 0x61, 0x3a, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5c, 0x2f, 0x5b, 0x5e, 0x3b, - 0x5d, 0x2b, 0x3b, 0x62, 0x61, 0x73, 0x65, 0x36, 0x34, 0x2c, 0x2f, 0x2c, - 0x20, 0x27, 0x27, 0x29, 0x2c, 0x20, 0x69, 0x64, 0x3a, 0x20, 0x31, 0x30, - 0x20, 0x7d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, - 0x64, 0x5f, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x72, - 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x61, 0x64, 0x65, 0x72, 0x2e, 0x72, 0x65, 0x61, - 0x64, 0x41, 0x73, 0x44, 0x61, 0x74, 0x61, 0x55, 0x52, 0x4c, 0x28, 0x73, - 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x46, 0x69, 0x6c, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, - 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, - 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x22, 0x22, 0x29, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x68, 0x61, 0x74, 0x28, - 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x20, 0x3d, 0x20, 0x22, 0x22, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, - 0x62, 0x6d, 0x69, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, - 0x6e, 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x65, 0x76, 0x65, - 0x6e, 0x74, 0x2e, 0x77, 0x68, 0x69, 0x63, 0x68, 0x20, 0x3d, 0x3d, 0x3d, - 0x20, 0x31, 0x33, 0x20, 0x26, 0x26, 0x20, 0x21, 0x65, 0x76, 0x65, 0x6e, - 0x74, 0x2e, 0x73, 0x68, 0x69, 0x66, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x28, 0x65, 0x76, 0x65, 0x6e, 0x74, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, - 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, 0x20, 0x6f, 0x6e, 0x73, 0x75, 0x62, - 0x6d, 0x69, 0x74, 0x3d, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, - 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, - 0x61, 0x72, 0x65, 0x61, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x4e, 0x61, 0x6d, 0x65, 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x20, 0x3f, 0x20, 0x22, 0x6c, 0x6f, 0x61, 0x64, 0x69, 0x6e, 0x67, 0x22, - 0x20, 0x3a, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x28, 0x65, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x65, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6f, 0x6e, 0x6b, 0x65, 0x79, 0x70, 0x72, 0x65, - 0x73, 0x73, 0x3d, 0x24, 0x7b, 0x65, 0x6e, 0x74, 0x65, 0x72, 0x53, 0x75, - 0x62, 0x6d, 0x69, 0x74, 0x73, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x53, - 0x61, 0x79, 0x20, 0x73, 0x6f, 0x6d, 0x65, 0x74, 0x68, 0x69, 0x6e, 0x67, - 0x2e, 0x2e, 0x2e, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x6f, 0x77, 0x73, - 0x3d, 0x32, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x74, 0x65, 0x78, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x7d, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, - 0x69, 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x72, 0x69, - 0x67, 0x68, 0x74, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x73, 0x75, 0x62, 0x6d, - 0x69, 0x74, 0x22, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x3d, 0x24, 0x7b, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, - 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x65, 0x6e, - 0x64, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, - 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x6c, 0x6f, 0x61, 0x64, 0x49, - 0x6d, 0x61, 0x67, 0x65, 0x7d, 0x3e, 0x55, 0x70, 0x6c, 0x6f, 0x61, 0x64, - 0x20, 0x49, 0x6d, 0x61, 0x67, 0x65, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, - 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, - 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, 0x74, - 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, - 0x3d, 0x24, 0x7b, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, - 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x74, - 0x6f, 0x70, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, - 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, - 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, - 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x73, 0x28, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x6f, 0x70, 0x28, 0x65, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x75, 0x6e, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, - 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, - 0x7b, 0x73, 0x75, 0x62, 0x6d, 0x69, 0x74, 0x7d, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x3d, 0x22, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x64, - 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, 0x64, 0x3d, 0x24, 0x7b, 0x67, 0x65, - 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x7d, 0x3e, 0x53, 0x74, 0x61, 0x72, 0x74, 0x3c, 0x2f, 0x62, - 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, - 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x73, - 0x74, 0x6f, 0x70, 0x7d, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, - 0x64, 0x3d, 0x24, 0x7b, 0x21, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, - 0x69, 0x6e, 0x67, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3e, 0x53, - 0x74, 0x6f, 0x70, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, - 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x72, 0x65, 0x73, 0x65, 0x74, 0x7d, 0x3e, - 0x52, 0x65, 0x73, 0x65, 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3d, 0x20, 0x28, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x20, 0x3d, - 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, - 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, 0x73, 0x65, - 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, - 0x2f, 0x20, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, - 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x28, 0x69, 0x66, 0x20, 0x6e, - 0x65, 0x65, 0x64, 0x65, 0x64, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x61, 0x72, - 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, - 0x6e, 0x65, 0x72, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, - 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, - 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x20, 0x26, 0x26, - 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, - 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, 0x20, 0x3c, 0x3d, 0x20, - 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, - 0x6c, 0x54, 0x6f, 0x70, 0x20, 0x2b, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, - 0x74, 0x2e, 0x6f, 0x66, 0x66, 0x73, 0x65, 0x74, 0x48, 0x65, 0x69, 0x67, - 0x68, 0x74, 0x20, 0x2b, 0x20, 0x33, 0x30, 0x30, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x54, - 0x6f, 0x28, 0x30, 0x2c, 0x20, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x2e, - 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x48, 0x65, 0x69, 0x67, 0x68, 0x74, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x6d, 0x65, - 0x73, 0x73, 0x61, 0x67, 0x65, 0x73, 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, - 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x4d, 0x6f, - 0x64, 0x65, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, - 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x27, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, - 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, 0x75, 0x73, 0x65, 0x72, 0x2c, 0x20, - 0x64, 0x61, 0x74, 0x61, 0x5d, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, - 0x67, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, - 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, 0x41, 0x72, - 0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, - 0x64, 0x61, 0x74, 0x61, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, - 0x62, 0x73, 0x20, 0x3e, 0x20, 0x30, 0x20, 0x26, 0x26, 0x20, 0x69, 0x73, - 0x41, 0x72, 0x72, 0x61, 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3d, 0x20, - 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, - 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, - 0x61, 0x74, 0x61, 0x3d, 0x24, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x7d, 0x20, - 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x74, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x69, 0x73, 0x41, 0x72, - 0x72, 0x61, 0x79, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x20, 0x3f, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x61, 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, - 0x67, 0x20, 0x3d, 0x3e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, - 0x27, 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x5e, 0x5c, 0x73, 0x2b, 0x2f, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x20, 0x3a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x61, 0x74, 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, - 0x20, 0x3d, 0x20, 0x69, 0x73, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x3f, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x65, - 0x78, 0x74, 0x20, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, - 0x7b, 0x4d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, - 0x7d, 0x20, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x65, 0x6d, - 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x74, 0x65, 0x78, 0x74, 0x29, 0x7d, - 0x20, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x75, 0x73, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x70, 0x20, - 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, - 0x3e, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x7b, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x28, 0x75, 0x73, 0x65, 0x72, - 0x29, 0x7d, 0x3a, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, - 0x20, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c, - 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x69, 0x73, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x4d, 0x6f, 0x64, 0x65, 0x20, 0x3f, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x6b, 0x65, 0x79, - 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x3e, 0x24, 0x7b, - 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, - 0x61, 0x6e, 0x3e, 0x60, 0x20, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x3c, 0x70, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x69, 0x6e, 0x64, - 0x65, 0x78, 0x7d, 0x3e, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, - 0x65, 0x7d, 0x3c, 0x2f, 0x70, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6f, - 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x45, 0x64, 0x69, 0x74, - 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x70, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x3d, 0x20, 0x65, 0x2e, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x54, 0x65, 0x78, - 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, - 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, 0x20, - 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x69, - 0x64, 0x3d, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x72, 0x65, 0x66, - 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, - 0x7d, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, 0x6d, 0x65, 0x73, 0x73, - 0x61, 0x67, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x7d, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x69, 0x6d, 0x67, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x22, - 0x77, 0x69, 0x64, 0x74, 0x68, 0x3a, 0x20, 0x36, 0x30, 0x25, 0x3b, 0x24, - 0x7b, 0x21, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, 0x73, 0x65, - 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x20, 0x3f, 0x20, 0x60, 0x64, 0x69, - 0x73, 0x70, 0x6c, 0x61, 0x79, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, - 0x60, 0x20, 0x3a, 0x20, 0x60, 0x60, 0x7d, 0x22, 0x20, 0x73, 0x72, 0x63, - 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x5f, - 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x7d, 0x22, 0x2f, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x73, 0x70, 0x61, 0x6e, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x65, 0x64, 0x69, 0x74, 0x61, 0x62, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x69, - 0x73, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x4d, - 0x6f, 0x64, 0x65, 0x7d, 0x20, 0x72, 0x65, 0x66, 0x3d, 0x24, 0x7b, 0x63, - 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x7d, 0x20, 0x6f, 0x6e, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x68, 0x61, 0x6e, 0x64, - 0x6c, 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, - 0x45, 0x64, 0x69, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x65, 0x73, - 0x73, 0x61, 0x67, 0x65, 0x73, 0x2e, 0x66, 0x6c, 0x61, 0x74, 0x4d, 0x61, - 0x70, 0x28, 0x63, 0x68, 0x61, 0x74, 0x4c, 0x69, 0x6e, 0x65, 0x29, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x60, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, - 0x46, 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, - 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, - 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, - 0x3a, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x28, - 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x20, - 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, 0x20, 0x65, - 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, - 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x20, 0x3d, - 0x20, 0x28, 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, - 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, 0x5b, 0x65, 0x6c, 0x2e, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3a, - 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, - 0x70, 0x61, 0x72, 0x73, 0x65, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x28, 0x65, - 0x6c, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x29, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x72, 0x61, 0x6d, - 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, - 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x27, 0x27, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, - 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, - 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, - 0x65, 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, - 0x61, 0x72, 0x4a, 0x73, 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x50, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x65, 0x6c, 0x2e, 0x74, 0x61, 0x72, - 0x67, 0x65, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, - 0x6e, 0x76, 0x65, 0x72, 0x74, 0x4a, 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, - 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x72, 0x79, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x70, 0x61, 0x72, - 0x73, 0x65, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x2e, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, - 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x53, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, - 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, - 0x72, 0x3a, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, - 0x6f, 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, - 0x4f, 0x72, 0x64, 0x65, 0x72, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x2c, 0x27, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x64, 0x75, 0x63, 0x65, 0x28, - 0x28, 0x61, 0x63, 0x63, 0x2c, 0x20, 0x63, 0x75, 0x72, 0x2c, 0x20, 0x69, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x20, 0x2e, 0x2e, 0x2e, 0x61, - 0x63, 0x63, 0x2c, 0x20, 0x5b, 0x63, 0x75, 0x72, 0x2e, 0x74, 0x72, 0x69, - 0x6d, 0x28, 0x29, 0x5d, 0x3a, 0x20, 0x69, 0x20, 0x7d, 0x29, 0x2c, 0x20, - 0x7b, 0x7d, 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x66, - 0x65, 0x74, 0x63, 0x68, 0x3a, 0x20, 0x74, 0x72, 0x75, 0x65, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, - 0x74, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x2e, - 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x52, 0x65, 0x66, 0x73, 0x28, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x27, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, - 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x2c, 0x20, 0x27, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x2e, - 0x2e, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3a, 0x20, - 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x65, 0x72, 0x2e, 0x66, 0x6f, - 0x72, 0x6d, 0x61, 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, - 0x29, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x63, 0x61, 0x74, 0x63, 0x68, 0x20, 0x28, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, - 0x6c, 0x65, 0x72, 0x74, 0x28, 0x60, 0x43, 0x6f, 0x6e, 0x76, 0x65, 0x72, - 0x74, 0x20, 0x66, 0x61, 0x69, 0x6c, 0x65, 0x64, 0x3a, 0x20, 0x24, 0x7b, - 0x65, 0x2e, 0x6d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x7d, 0x60, 0x29, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x46, 0x6c, 0x6f, 0x61, - 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x3d, 0x20, 0x28, 0x7b, 0x20, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x2c, 0x20, - 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x73, - 0x74, 0x65, 0x70, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, - 0x6d, 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x7d, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, - 0x61, 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, - 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, - 0x24, 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, - 0x22, 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x73, 0x74, 0x65, - 0x70, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x74, 0x65, 0x70, 0x7d, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, - 0x7d, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x50, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x7d, - 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x20, - 0x3d, 0x20, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x2c, 0x20, - 0x6d, 0x61, 0x78, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x2c, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x7d, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, - 0x65, 0x7d, 0x22, 0x3e, 0x24, 0x7b, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x7d, - 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, - 0x6e, 0x67, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x24, 0x7b, 0x6e, - 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x6d, 0x69, 0x6e, 0x3d, 0x22, 0x24, - 0x7b, 0x6d, 0x69, 0x6e, 0x7d, 0x22, 0x20, 0x6d, 0x61, 0x78, 0x3d, 0x22, - 0x24, 0x7b, 0x6d, 0x61, 0x78, 0x7d, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3d, 0x22, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x22, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, - 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, - 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x75, 0x73, 0x65, - 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, 0x73, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x65, 0x2e, - 0x70, 0x72, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x44, 0x65, 0x66, 0x61, 0x75, - 0x6c, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x54, 0x6f, 0x44, 0x65, 0x66, - 0x61, 0x75, 0x6c, 0x74, 0x41, 0x6e, 0x64, 0x41, 0x70, 0x70, 0x6c, 0x79, - 0x28, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, - 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x65, - 0x6c, 0x65, 0x63, 0x74, 0x65, 0x64, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, - 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, 0x64, 0x65, - 0x66, 0x61, 0x75, 0x6c, 0x74, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, - 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x64, 0x69, 0x73, 0x61, 0x62, 0x6c, 0x65, - 0x64, 0x3e, 0x55, 0x73, 0x69, 0x6e, 0x67, 0x20, 0x64, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x20, - 0x6f, 0x6e, 0x63, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x75, 0x73, - 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x52, 0x65, - 0x73, 0x65, 0x74, 0x7d, 0x3e, 0x52, 0x65, 0x73, 0x65, 0x74, 0x20, 0x61, - 0x6c, 0x6c, 0x20, 0x74, 0x6f, 0x20, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, - 0x74, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x61, 0x75, 0x74, 0x6f, 0x73, 0x61, - 0x76, 0x65, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x20, - 0x6f, 0x6e, 0x20, 0x65, 0x76, 0x65, 0x72, 0x79, 0x20, 0x63, 0x68, 0x61, - 0x6e, 0x67, 0x65, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x75, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x41, 0x75, 0x74, 0x6f, 0x73, 0x61, 0x76, 0x65, 0x28, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x5d, 0x29, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, - 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x74, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x47, 0x72, 0x61, - 0x6d, 0x6d, 0x61, 0x72, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, - 0x64, 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, - 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, - 0x65, 0x72, 0x3d, 0x22, 0x55, 0x73, 0x65, 0x20, 0x67, 0x62, 0x6e, 0x66, - 0x20, 0x6f, 0x72, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x2b, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, 0x22, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x67, - 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x72, 0x6f, 0x77, - 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x70, 0x2d, - 0x6f, 0x72, 0x64, 0x65, 0x72, 0x22, 0x20, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, 0x6f, 0x72, 0x64, 0x65, - 0x72, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x31, 0x2c, 0x70, 0x72, 0x6f, - 0x70, 0x32, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x33, 0x22, 0x20, 0x6f, 0x6e, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, - 0x74, 0x65, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x4a, 0x73, 0x6f, - 0x6e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x50, 0x72, 0x6f, 0x70, 0x4f, - 0x72, 0x64, 0x65, 0x72, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x62, 0x75, - 0x74, 0x74, 0x6f, 0x6e, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x62, - 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x22, 0x20, 0x6f, 0x6e, 0x63, 0x6c, 0x69, - 0x63, 0x6b, 0x3d, 0x24, 0x7b, 0x63, 0x6f, 0x6e, 0x76, 0x65, 0x72, 0x74, - 0x4a, 0x53, 0x4f, 0x4e, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x47, 0x72, - 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x7d, 0x3e, 0x43, 0x6f, 0x6e, 0x76, 0x65, - 0x72, 0x74, 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x20, 0x53, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x3c, 0x2f, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, 0x6e, - 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, - 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x46, - 0x6f, 0x72, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x22, 0x3e, - 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, - 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x70, 0x72, 0x6f, 0x6d, 0x70, - 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x7d, 0x22, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, - 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, - 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, - 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, 0x6d, 0x6c, - 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, 0x6e, 0x74, - 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, 0x74, 0x28, - 0x29, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, - 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x75, 0x73, 0x65, - 0x72, 0x22, 0x3e, 0x55, 0x73, 0x65, 0x72, 0x20, 0x6e, 0x61, 0x6d, 0x65, - 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, - 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, - 0x75, 0x73, 0x65, 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, - 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x75, 0x73, 0x65, 0x72, 0x7d, 0x22, 0x20, - 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, - 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, - 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, 0x62, 0x6f, 0x74, 0x22, - 0x3e, 0x42, 0x6f, 0x74, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3c, 0x2f, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, - 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, - 0x74, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x63, 0x68, 0x61, - 0x72, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x63, 0x68, 0x61, 0x72, 0x7d, 0x22, 0x20, 0x6f, 0x6e, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, - 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, 0x3d, 0x22, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x3e, 0x50, 0x72, - 0x6f, 0x6d, 0x70, 0x74, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, - 0x65, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, - 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, - 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, - 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x34, 0x20, 0x6f, 0x6e, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, - 0x6f, 0x72, 0x3d, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, - 0x22, 0x3e, 0x43, 0x68, 0x61, 0x74, 0x20, 0x68, 0x69, 0x73, 0x74, 0x6f, - 0x72, 0x79, 0x20, 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x3c, - 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x74, - 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61, 0x20, 0x69, 0x64, 0x3d, 0x22, - 0x74, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x68, 0x69, 0x73, 0x74, 0x6f, 0x72, 0x79, 0x54, - 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x68, 0x69, 0x73, 0x74, - 0x6f, 0x72, 0x79, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, 0x74, 0x65, 0x7d, - 0x22, 0x20, 0x72, 0x6f, 0x77, 0x73, 0x3d, 0x31, 0x20, 0x6f, 0x6e, 0x69, - 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x47, 0x72, 0x61, - 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x28, - 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, - 0x6f, 0x72, 0x6d, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x68, 0x74, - 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x6d, 0x70, 0x74, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x53, 0x65, - 0x74, 0x28, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, - 0x3e, 0x24, 0x7b, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x43, 0x6f, - 0x6e, 0x74, 0x72, 0x6f, 0x6c, 0x28, 0x29, 0x7d, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x72, 0x6d, - 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x24, 0x7b, 0x55, 0x73, 0x65, 0x72, 0x54, 0x65, 0x6d, 0x70, 0x6c, 0x61, - 0x74, 0x65, 0x52, 0x65, 0x73, 0x65, 0x74, 0x42, 0x75, 0x74, 0x74, 0x6f, - 0x6e, 0x7d, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x63, 0x6c, 0x61, 0x73, - 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, 0x22, 0x3e, 0x3c, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, - 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x74, - 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, - 0x63, 0x68, 0x61, 0x74, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, - 0x64, 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x22, 0x63, 0x68, 0x61, 0x74, 0x22, 0x7d, 0x20, 0x6f, - 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x7d, 0x20, - 0x2f, 0x3e, 0x20, 0x43, 0x68, 0x61, 0x74, 0x3c, 0x2f, 0x6c, 0x61, 0x62, - 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x73, 0x6c, 0x69, 0x6d, - 0x22, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3d, 0x22, 0x74, 0x79, 0x70, 0x65, 0x22, 0x20, 0x76, 0x61, - 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, - 0x3d, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x22, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, - 0x6e, 0x22, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x53, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x43, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x73, 0x65, 0x73, - 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, - 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, - 0x74, 0x27, 0x20, 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x43, 0x6f, 0x6e, - 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x20, 0x3a, 0x20, - 0x43, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, - 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x28, 0x29, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, - 0x73, 0x73, 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x49, 0x6e, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x50, 0x72, 0x65, 0x64, 0x69, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, - 0x6e, 0x5f, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x22, 0x2c, 0x20, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, - 0x65, 0x64, 0x69, 0x63, 0x74, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x65, 0x6d, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x32, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, - 0x65, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x65, 0x6d, 0x70, 0x65, 0x72, 0x61, 0x74, 0x75, 0x72, 0x65, 0x20, - 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, - 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3a, 0x20, 0x22, 0x50, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x20, - 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x20, 0x73, 0x65, 0x71, 0x75, 0x65, - 0x6e, 0x63, 0x65, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x32, - 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x72, 0x65, 0x70, - 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, - 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, - 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x72, 0x65, - 0x70, 0x65, 0x61, 0x74, 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, - 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, - 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, - 0x20, 0x22, 0x43, 0x6f, 0x6e, 0x73, 0x69, 0x64, 0x65, 0x72, 0x20, 0x4e, - 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x66, 0x6f, 0x72, 0x20, - 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x69, 0x7a, 0x65, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x32, 0x30, 0x34, 0x38, 0x2c, 0x20, 0x6d, 0x69, - 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, - 0x22, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, - 0x5f, 0x6e, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x5f, 0x6c, 0x61, 0x73, 0x74, - 0x5f, 0x6e, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, - 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x6f, 0x70, 0x2d, 0x4b, 0x20, 0x73, 0x61, - 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x31, 0x30, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x2d, 0x31, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, - 0x6f, 0x70, 0x5f, 0x6b, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x74, 0x6f, 0x70, 0x5f, 0x6b, 0x20, 0x7d, 0x29, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, - 0x54, 0x6f, 0x70, 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, - 0x6e, 0x67, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, - 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x6f, 0x70, 0x5f, - 0x70, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x6f, 0x70, 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x6e, - 0x2d, 0x50, 0x20, 0x73, 0x61, 0x6d, 0x70, 0x6c, 0x69, 0x6e, 0x67, 0x22, - 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, - 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x6e, 0x5f, 0x70, 0x22, 0x2c, - 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, - 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x6e, - 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, 0x73, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x4d, 0x6f, - 0x72, 0x65, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x3c, 0x2f, - 0x73, 0x75, 0x6d, 0x6d, 0x61, 0x72, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, - 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, - 0x3d, 0x22, 0x74, 0x77, 0x6f, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, - 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, 0x46, 0x53, - 0x2d, 0x5a, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, - 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x74, 0x66, 0x73, 0x5f, - 0x7a, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x66, 0x73, 0x5f, 0x7a, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, - 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x54, - 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x20, 0x50, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, - 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, - 0x20, 0x22, 0x74, 0x79, 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x22, - 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, 0x31, - 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, - 0x70, 0x69, 0x63, 0x61, 0x6c, 0x5f, 0x70, 0x20, 0x7d, 0x29, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, - 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, - 0x22, 0x50, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x20, 0x70, 0x65, - 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x70, - 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, 0x65, 0x6e, 0x61, - 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, - 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, - 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x63, 0x65, 0x5f, 0x70, - 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, - 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, - 0x46, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x20, 0x70, 0x65, - 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, - 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, - 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x66, - 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, 0x5f, 0x70, 0x65, 0x6e, - 0x61, 0x6c, 0x74, 0x79, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x66, 0x72, 0x65, 0x71, 0x75, 0x65, 0x6e, 0x63, 0x79, - 0x5f, 0x70, 0x65, 0x6e, 0x61, 0x6c, 0x74, 0x79, 0x20, 0x7d, 0x29, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x68, 0x72, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, - 0x6c, 0x64, 0x73, 0x65, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, - 0x22, 0x74, 0x68, 0x72, 0x65, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x30, - 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, 0x7b, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, 0x3d, - 0x20, 0x30, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, - 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x6e, 0x6f, - 0x20, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x3c, 0x2f, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6c, - 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, - 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, 0x22, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, - 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, - 0x31, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, 0x24, - 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x3d, - 0x3d, 0x20, 0x31, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, - 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, 0x4d, - 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x31, 0x3c, 0x2f, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, - 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x72, 0x61, 0x64, 0x69, 0x6f, - 0x22, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x6d, 0x69, 0x72, 0x6f, - 0x73, 0x74, 0x61, 0x74, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, - 0x22, 0x32, 0x22, 0x20, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x3d, - 0x24, 0x7b, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x2e, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, - 0x3d, 0x3d, 0x20, 0x32, 0x7d, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, - 0x74, 0x3d, 0x24, 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x49, 0x6e, 0x74, 0x7d, 0x20, 0x2f, 0x3e, 0x20, - 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x20, 0x76, 0x32, 0x3c, - 0x2f, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, - 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x74, 0x61, 0x75, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x31, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, - 0x22, 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, - 0x75, 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, - 0x30, 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x74, 0x61, 0x75, - 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x46, 0x6c, 0x6f, - 0x61, 0x74, 0x46, 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, - 0x62, 0x65, 0x6c, 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x72, 0x6f, 0x73, 0x74, - 0x61, 0x74, 0x20, 0x65, 0x74, 0x61, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, - 0x3a, 0x20, 0x31, 0x2e, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, - 0x30, 0x2e, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, - 0x6d, 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, - 0x22, 0x2c, 0x20, 0x73, 0x74, 0x65, 0x70, 0x3a, 0x20, 0x30, 0x2e, 0x30, - 0x31, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, - 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6d, - 0x69, 0x72, 0x6f, 0x73, 0x74, 0x61, 0x74, 0x5f, 0x65, 0x74, 0x61, 0x20, - 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, - 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, - 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, 0x69, - 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x3a, - 0x20, 0x22, 0x53, 0x68, 0x6f, 0x77, 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x22, 0x2c, 0x20, 0x6d, - 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, - 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6e, - 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x73, - 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, - 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, - 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x49, 0x6e, 0x74, 0x46, - 0x69, 0x65, 0x6c, 0x64, 0x28, 0x7b, 0x20, 0x6c, 0x61, 0x62, 0x65, 0x6c, - 0x3a, 0x20, 0x22, 0x4d, 0x69, 0x6e, 0x20, 0x50, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x66, 0x72, 0x6f, - 0x6d, 0x20, 0x65, 0x61, 0x63, 0x68, 0x20, 0x53, 0x61, 0x6d, 0x70, 0x6c, - 0x65, 0x72, 0x22, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x3a, 0x20, 0x31, 0x30, - 0x2c, 0x20, 0x6d, 0x69, 0x6e, 0x3a, 0x20, 0x30, 0x2c, 0x20, 0x6e, 0x61, - 0x6d, 0x65, 0x3a, 0x20, 0x22, 0x6d, 0x69, 0x6e, 0x5f, 0x6b, 0x65, 0x65, - 0x70, 0x22, 0x2c, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x70, - 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x6d, 0x69, 0x6e, 0x5f, 0x6b, 0x65, 0x65, 0x70, 0x20, 0x7d, 0x29, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x66, 0x69, 0x65, 0x6c, 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x20, 0x66, 0x6f, 0x72, - 0x3d, 0x22, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x22, 0x3e, 0x41, - 0x50, 0x49, 0x20, 0x4b, 0x65, 0x79, 0x3c, 0x2f, 0x6c, 0x61, 0x62, 0x65, - 0x6c, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x20, - 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x74, 0x65, 0x78, 0x74, 0x22, 0x20, - 0x6e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, - 0x79, 0x22, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x22, 0x24, 0x7b, - 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2e, 0x61, 0x70, 0x69, 0x5f, 0x6b, 0x65, 0x79, 0x7d, 0x22, 0x20, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x68, 0x6f, 0x6c, 0x64, 0x65, 0x72, 0x3d, 0x22, - 0x45, 0x6e, 0x74, 0x65, 0x72, 0x20, 0x41, 0x50, 0x49, 0x20, 0x6b, 0x65, - 0x79, 0x22, 0x20, 0x6f, 0x6e, 0x69, 0x6e, 0x70, 0x75, 0x74, 0x3d, 0x24, - 0x7b, 0x75, 0x70, 0x64, 0x61, 0x74, 0x65, 0x50, 0x61, 0x72, 0x61, 0x6d, - 0x73, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x66, 0x69, 0x65, 0x6c, - 0x64, 0x73, 0x65, 0x74, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x65, 0x74, 0x61, 0x69, 0x6c, - 0x73, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x66, 0x6f, 0x72, 0x6d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x62, - 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x70, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, - 0x68, 0x2e, 0x66, 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, - 0x2a, 0x20, 0x28, 0x31, 0x20, 0x2d, 0x20, 0x70, 0x29, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x67, 0x20, 0x3d, 0x20, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, 0x6c, 0x6f, - 0x6f, 0x72, 0x28, 0x31, 0x39, 0x32, 0x20, 0x2a, 0x20, 0x70, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x60, 0x72, 0x67, 0x62, 0x61, 0x28, 0x24, 0x7b, 0x72, 0x7d, - 0x2c, 0x24, 0x7b, 0x67, 0x7d, 0x2c, 0x30, 0x2c, 0x30, 0x2e, 0x33, 0x29, - 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x72, 0x6f, 0x62, - 0x61, 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x3d, 0x20, - 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x2e, 0x64, 0x61, - 0x74, 0x61, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6d, 0x73, 0x67, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7d, 0x20, 0x3d, - 0x20, 0x6d, 0x73, 0x67, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x21, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, - 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x20, 0x7c, 0x7c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, - 0x6c, 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x30, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, - 0x3e, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x4e, 0x6f, 0x74, 0x20, - 0x66, 0x6f, 0x72, 0x20, 0x62, 0x79, 0x74, 0x65, 0x20, 0x70, 0x61, 0x69, - 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x2e, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x2e, 0x73, 0x74, 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, - 0x74, 0x68, 0x28, 0x27, 0x62, 0x79, 0x74, 0x65, 0x3a, 0x20, 0x5c, 0x5c, - 0x27, 0x29, 0x29, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, - 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, 0x74, - 0x61, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, 0x74, 0x69, - 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, 0x6c, 0x69, - 0x74, 0x69, 0x65, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x70, 0x72, 0x6f, - 0x62, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x74, - 0x65, 0x6e, 0x74, 0x3a, 0x20, 0x70, 0x72, 0x6f, 0x62, 0x2e, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, - 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x3a, 0x20, 0x5b, 0x70, 0x72, - 0x6f, 0x62, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, - 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x72, 0x6f, 0x62, 0x61, - 0x62, 0x69, 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x7d, 0x20, 0x64, 0x61, - 0x74, 0x61, 0x3d, 0x24, 0x7b, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x44, 0x61, - 0x74, 0x61, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x7b, 0x20, 0x70, - 0x72, 0x6f, 0x62, 0x73, 0x2c, 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x20, 0x7d, 0x20, 0x3d, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6c, 0x65, - 0x74, 0x69, 0x6f, 0x6e, 0x5f, 0x70, 0x72, 0x6f, 0x62, 0x61, 0x62, 0x69, - 0x6c, 0x69, 0x74, 0x69, 0x65, 0x73, 0x5b, 0x30, 0x5d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x62, - 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x28, 0x70, 0x20, 0x3d, 0x3e, 0x20, - 0x70, 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, - 0x74, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x20, - 0x3d, 0x20, 0x66, 0x6f, 0x75, 0x6e, 0x64, 0x20, 0x3f, 0x20, 0x70, 0x72, - 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x66, 0x6f, 0x75, 0x6e, - 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, - 0x72, 0x61, 0x6e, 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x3d, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, - 0x22, 0x70, 0x72, 0x6f, 0x62, 0x2d, 0x73, 0x65, 0x74, 0x22, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x62, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, - 0x28, 0x70, 0x2c, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, - 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6b, 0x65, 0x79, 0x3d, 0x24, 0x7b, - 0x69, 0x6e, 0x64, 0x65, 0x78, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x69, 0x74, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x60, 0x70, 0x72, - 0x6f, 0x62, 0x3a, 0x20, 0x24, 0x7b, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, - 0x7d, 0x60, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, - 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x61, - 0x64, 0x64, 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x27, 0x30, 0x2e, 0x33, 0x65, - 0x6d, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, - 0x6f, 0x75, 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, - 0x2e, 0x74, 0x6f, 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x20, 0x3d, 0x3d, 0x3d, - 0x20, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x20, 0x3f, 0x20, 0x70, - 0x72, 0x6f, 0x62, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, - 0x72, 0x6f, 0x62, 0x29, 0x20, 0x3a, 0x20, 0x27, 0x74, 0x72, 0x61, 0x6e, - 0x73, 0x70, 0x61, 0x72, 0x65, 0x6e, 0x74, 0x27, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x70, 0x2e, 0x74, 0x6f, - 0x6b, 0x5f, 0x73, 0x74, 0x72, 0x7d, 0x3a, 0x20, 0x3c, 0x2f, 0x73, 0x70, - 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, - 0x70, 0x61, 0x6e, 0x3e, 0x24, 0x7b, 0x4d, 0x61, 0x74, 0x68, 0x2e, 0x66, - 0x6c, 0x6f, 0x6f, 0x72, 0x28, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x62, 0x20, - 0x2a, 0x20, 0x31, 0x30, 0x30, 0x29, 0x7d, 0x25, 0x3c, 0x2f, 0x73, 0x70, - 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, - 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x29, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x70, - 0x6f, 0x76, 0x65, 0x72, 0x7d, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, - 0x24, 0x7b, 0x7b, 0x20, 0x62, 0x61, 0x63, 0x6b, 0x67, 0x72, 0x6f, 0x75, - 0x6e, 0x64, 0x43, 0x6f, 0x6c, 0x6f, 0x72, 0x3a, 0x20, 0x70, 0x43, 0x6f, - 0x6c, 0x6f, 0x72, 0x20, 0x7d, 0x7d, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, - 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3d, 0x24, - 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, - 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6d, 0x73, 0x67, - 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, 0x2e, 0x6d, 0x61, 0x74, - 0x63, 0x68, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x29, 0x20, - 0x3f, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x62, 0x72, 0x20, 0x2f, - 0x3e, 0x60, 0x20, 0x3a, 0x20, 0x6d, 0x73, 0x67, 0x2e, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x6e, 0x74, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x70, 0x6f, 0x6f, 0x72, 0x20, 0x6d, - 0x61, 0x6e, 0x73, 0x20, 0x6d, 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, - 0x20, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x6d, 0x65, 0x6e, 0x74, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4d, - 0x61, 0x72, 0x6b, 0x64, 0x6f, 0x77, 0x6e, 0x69, 0x73, 0x68, 0x20, 0x3d, - 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6d, 0x64, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, 0x61, - 0x6d, 0x73, 0x2e, 0x74, 0x65, 0x78, 0x74, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x26, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, 0x61, 0x6d, 0x70, - 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3c, 0x2f, - 0x67, 0x2c, 0x20, 0x27, 0x26, 0x6c, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, - 0x61, 0x63, 0x65, 0x28, 0x2f, 0x3e, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x26, - 0x67, 0x74, 0x3b, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, - 0x5e, 0x23, 0x7b, 0x31, 0x2c, 0x36, 0x7d, 0x20, 0x28, 0x2e, 0x2a, 0x29, - 0x24, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, 0x3c, 0x68, 0x33, 0x3e, - 0x24, 0x31, 0x3c, 0x2f, 0x68, 0x33, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, - 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x5c, 0x2a, 0x28, 0x2e, 0x2a, 0x3f, - 0x29, 0x5c, 0x2a, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, - 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, - 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x5f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x5f, 0x5f, 0x2f, - 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, - 0x24, 0x31, 0x3c, 0x2f, 0x73, 0x74, 0x72, 0x6f, 0x6e, 0x67, 0x3e, 0x27, - 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, - 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5c, 0x2a, 0x28, 0x2e, - 0x2a, 0x3f, 0x29, 0x5c, 0x2a, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, - 0x6d, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5f, 0x28, 0x2e, 0x2a, 0x3f, 0x29, - 0x5f, 0x2f, 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x65, 0x6d, 0x3e, 0x24, 0x31, - 0x3c, 0x2f, 0x65, 0x6d, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, - 0x28, 0x2f, 0x60, 0x60, 0x60, 0x2e, 0x2a, 0x3f, 0x5c, 0x6e, 0x28, 0x5b, - 0x5c, 0x73, 0x5c, 0x53, 0x5d, 0x2a, 0x3f, 0x29, 0x60, 0x60, 0x60, 0x2f, - 0x67, 0x2c, 0x20, 0x27, 0x3c, 0x70, 0x72, 0x65, 0x3e, 0x3c, 0x63, 0x6f, - 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, - 0x3c, 0x2f, 0x70, 0x72, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, - 0x65, 0x28, 0x2f, 0x60, 0x28, 0x2e, 0x2a, 0x3f, 0x29, 0x60, 0x2f, 0x67, - 0x2c, 0x20, 0x27, 0x3c, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x24, 0x31, 0x3c, - 0x2f, 0x63, 0x6f, 0x64, 0x65, 0x3e, 0x27, 0x29, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, - 0x65, 0x28, 0x2f, 0x5c, 0x6e, 0x2f, 0x67, 0x69, 0x6d, 0x2c, 0x20, 0x27, - 0x3c, 0x62, 0x72, 0x20, 0x2f, 0x3e, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x68, - 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, 0x64, 0x61, - 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, - 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x24, 0x7b, - 0x7b, 0x20, 0x5f, 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3a, 0x20, 0x6d, 0x64, - 0x20, 0x7d, 0x7d, 0x20, 0x2f, 0x3e, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, 0x20, 0x3d, - 0x20, 0x28, 0x70, 0x61, 0x72, 0x61, 0x6d, 0x73, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x21, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x2f, - 0x3e, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x5f, 0x70, 0x72, 0x65, - 0x64, 0x69, 0x63, 0x74, 0x65, 0x64, 0x7d, 0x20, 0x70, 0x72, 0x65, 0x64, - 0x69, 0x63, 0x74, 0x65, 0x64, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x5f, 0x63, 0x61, 0x63, - 0x68, 0x65, 0x64, 0x7d, 0x20, 0x63, 0x61, 0x63, 0x68, 0x65, 0x64, 0x2c, - 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, 0x61, 0x74, - 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x69, 0x6d, 0x69, - 0x6e, 0x67, 0x73, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, 0x74, 0x65, - 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x5f, - 0x6d, 0x73, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x29, - 0x7d, 0x6d, 0x73, 0x20, 0x70, 0x65, 0x72, 0x20, 0x74, 0x6f, 0x6b, 0x65, - 0x6e, 0x2c, 0x20, 0x24, 0x7b, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x53, 0x74, - 0x61, 0x74, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x69, - 0x6d, 0x69, 0x6e, 0x67, 0x73, 0x2e, 0x70, 0x72, 0x65, 0x64, 0x69, 0x63, - 0x74, 0x65, 0x64, 0x5f, 0x70, 0x65, 0x72, 0x5f, 0x73, 0x65, 0x63, 0x6f, - 0x6e, 0x64, 0x2e, 0x74, 0x6f, 0x46, 0x69, 0x78, 0x65, 0x64, 0x28, 0x32, - 0x29, 0x7d, 0x20, 0x74, 0x6f, 0x6b, 0x65, 0x6e, 0x73, 0x20, 0x70, 0x65, - 0x72, 0x20, 0x73, 0x65, 0x63, 0x6f, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x70, 0x61, 0x6e, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, - 0x69, 0x6d, 0x70, 0x6c, 0x65, 0x20, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, - 0x72, 0x20, 0x69, 0x6d, 0x70, 0x6c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, - 0x20, 0x3d, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x20, 0x3d, - 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x28, 0x66, - 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x3d, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, - 0x6e, 0x61, 0x6c, 0x28, 0x7b, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x27, - 0x30, 0x70, 0x78, 0x27, 0x2c, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, - 0x27, 0x30, 0x70, 0x78, 0x27, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x62, 0x75, - 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x75, 0x73, - 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, - 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x20, 0x3d, - 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, - 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, - 0x70, 0x6f, 0x76, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, - 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x63, 0x74, 0x20, 0x3d, 0x20, - 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x67, 0x65, 0x74, 0x42, 0x6f, 0x75, - 0x6e, 0x64, 0x69, 0x6e, 0x67, 0x43, 0x6c, 0x69, 0x65, 0x6e, 0x74, 0x52, - 0x65, 0x63, 0x74, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, - 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, - 0x2e, 0x62, 0x6f, 0x74, 0x74, 0x6f, 0x6d, 0x20, 0x2b, 0x20, 0x77, 0x69, - 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, 0x72, 0x6f, 0x6c, 0x6c, 0x59, - 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, - 0x60, 0x24, 0x7b, 0x72, 0x65, 0x63, 0x74, 0x2e, 0x6c, 0x65, 0x66, 0x74, - 0x20, 0x2b, 0x20, 0x77, 0x69, 0x6e, 0x64, 0x6f, 0x77, 0x2e, 0x73, 0x63, - 0x72, 0x6f, 0x6c, 0x6c, 0x58, 0x7d, 0x70, 0x78, 0x60, 0x2c, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x3d, 0x20, 0x21, 0x69, 0x73, - 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x68, 0x61, - 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, - 0x73, 0x69, 0x64, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x65, 0x76, 0x65, 0x6e, - 0x74, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x6f, 0x70, 0x6f, - 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, - 0x6e, 0x74, 0x20, 0x26, 0x26, 0x20, 0x21, 0x70, 0x6f, 0x70, 0x6f, 0x76, - 0x65, 0x72, 0x52, 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, - 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, - 0x20, 0x26, 0x26, 0x20, 0x21, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, - 0x65, 0x66, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x2e, 0x63, - 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x73, 0x28, 0x65, 0x76, 0x65, 0x6e, - 0x74, 0x2e, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3d, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x75, - 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x28, 0x28, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x61, 0x64, - 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, - 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, - 0x6e, 0x27, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, - 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x64, 0x6f, - 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, - 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, 0x69, 0x73, 0x74, 0x65, 0x6e, - 0x65, 0x72, 0x28, 0x27, 0x6d, 0x6f, 0x75, 0x73, 0x65, 0x64, 0x6f, 0x77, - 0x6e, 0x27, 0x2c, 0x20, 0x68, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x43, 0x6c, - 0x69, 0x63, 0x6b, 0x4f, 0x75, 0x74, 0x73, 0x69, 0x64, 0x65, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x2c, 0x20, 0x5b, 0x5d, 0x29, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x70, 0x61, 0x6e, 0x20, - 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x7d, 0x20, 0x72, 0x65, 0x66, - 0x3d, 0x24, 0x7b, 0x62, 0x75, 0x74, 0x74, 0x6f, 0x6e, 0x52, 0x65, 0x66, - 0x7d, 0x20, 0x6f, 0x6e, 0x43, 0x6c, 0x69, 0x63, 0x6b, 0x3d, 0x24, 0x7b, - 0x74, 0x6f, 0x67, 0x67, 0x6c, 0x65, 0x50, 0x6f, 0x70, 0x6f, 0x76, 0x65, - 0x72, 0x7d, 0x3e, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, - 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, 0x3c, 0x2f, 0x73, 0x70, - 0x61, 0x6e, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x24, 0x7b, 0x69, 0x73, 0x4f, 0x70, 0x65, 0x6e, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x20, 0x26, 0x26, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, - 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, 0x20, 0x69, 0x6e, 0x74, - 0x6f, 0x3d, 0x22, 0x23, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x22, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x64, 0x69, 0x76, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x3d, - 0x24, 0x7b, 0x70, 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x52, 0x65, 0x66, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x3d, 0x22, 0x70, - 0x6f, 0x70, 0x6f, 0x76, 0x65, 0x72, 0x2d, 0x63, 0x6f, 0x6e, 0x74, 0x65, - 0x6e, 0x74, 0x22, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, - 0x24, 0x7b, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x6f, 0x70, 0x3a, 0x20, 0x70, 0x6f, 0x73, - 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, - 0x74, 0x6f, 0x70, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x66, 0x74, 0x3a, 0x20, 0x70, - 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x6c, 0x65, 0x66, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x70, 0x6f, 0x70, - 0x6f, 0x76, 0x65, 0x72, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x24, 0x7b, 0x50, - 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x53, 0x6f, 0x75, 0x72, 0x63, - 0x65, 0x3a, 0x20, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, - 0x72, 0x74, 0x61, 0x6c, 0x20, 0x28, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, - 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6d, - 0x2f, 0x64, 0x65, 0x76, 0x65, 0x6c, 0x6f, 0x70, 0x69, 0x74, 0x2f, 0x70, - 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, - 0x2f, 0x62, 0x6c, 0x6f, 0x62, 0x2f, 0x6d, 0x61, 0x73, 0x74, 0x65, 0x72, - 0x2f, 0x73, 0x72, 0x63, 0x2f, 0x70, 0x72, 0x65, 0x61, 0x63, 0x74, 0x2d, - 0x70, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x2e, 0x6a, 0x73, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2a, 0x2a, 0x20, 0x52, 0x65, 0x64, 0x69, 0x72, - 0x65, 0x63, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, - 0x67, 0x20, 0x6f, 0x66, 0x20, 0x64, 0x65, 0x73, 0x63, 0x65, 0x6e, 0x64, - 0x61, 0x6e, 0x74, 0x73, 0x20, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x74, 0x68, - 0x65, 0x20, 0x67, 0x69, 0x76, 0x65, 0x6e, 0x20, 0x43, 0x53, 0x53, 0x20, - 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x20, 0x2a, 0x2f, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, 0x20, 0x50, 0x6f, - 0x72, 0x74, 0x61, 0x6c, 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, - 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, - 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, - 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, 0x6e, 0x20, 0x70, 0x72, 0x6f, - 0x70, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x5b, 0x69, 0x5d, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x5b, 0x69, 0x5d, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x73, 0x65, 0x74, - 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x44, 0x69, 0x64, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, - 0x20, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, 0x72, 0x20, 0x3d, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, - 0x61, 0x79, 0x65, 0x72, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, - 0x4c, 0x61, 0x79, 0x65, 0x72, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, - 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, 0x79, 0x65, - 0x72, 0x28, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, - 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x66, - 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, - 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, - 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x29, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x2e, 0x70, 0x61, 0x72, - 0x65, 0x6e, 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x2e, 0x72, 0x65, 0x6d, 0x6f, - 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x28, 0x6e, 0x6f, - 0x64, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x79, 0x70, - 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x20, 0x3d, 0x3d, 0x3d, - 0x20, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x20, 0x3f, 0x20, - 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, - 0x72, 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x6e, - 0x6f, 0x64, 0x65, 0x29, 0x20, 0x3a, 0x20, 0x6e, 0x6f, 0x64, 0x65, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x4c, 0x61, - 0x79, 0x65, 0x72, 0x28, 0x73, 0x68, 0x6f, 0x77, 0x20, 0x3d, 0x20, 0x74, - 0x72, 0x75, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x69, 0x73, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x65, 0x64, 0x29, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x63, 0x6c, 0x65, 0x61, - 0x6e, 0x20, 0x75, 0x70, 0x20, 0x6f, 0x6c, 0x64, 0x20, 0x6e, 0x6f, 0x64, - 0x65, 0x20, 0x69, 0x66, 0x20, 0x6d, 0x6f, 0x76, 0x69, 0x6e, 0x67, 0x20, - 0x62, 0x61, 0x73, 0x65, 0x73, 0x3a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x21, - 0x3d, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, - 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x50, 0x6f, 0x69, 0x6e, 0x74, 0x65, - 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, - 0x70, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x26, 0x26, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, - 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, - 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, - 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x2f, 0x3e, 0x60, - 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x2c, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x20, 0x3d, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x4e, 0x6f, - 0x64, 0x65, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x2e, 0x69, 0x6e, 0x74, 0x6f, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, - 0x6f, 0x74, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, - 0x28, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, - 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, 0x7d, 0x20, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x78, 0x74, 0x3d, 0x24, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x7d, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x24, 0x7b, - 0x73, 0x68, 0x6f, 0x77, 0x20, 0x26, 0x26, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, - 0x72, 0x65, 0x6e, 0x20, 0x7c, 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x24, 0x7b, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, - 0x78, 0x79, 0x7d, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x60, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x6e, 0x74, - 0x6f, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x72, 0x65, 0x6d, 0x6f, - 0x74, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, - 0x65, 0x72, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, - 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x68, 0x69, 0x67, 0x68, 0x2d, 0x6f, 0x72, 0x64, 0x65, 0x72, 0x20, - 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x74, 0x68, - 0x61, 0x74, 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x73, 0x20, 0x69, - 0x74, 0x73, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x20, 0x63, 0x68, 0x69, - 0x6c, 0x64, 0x20, 0x69, 0x66, 0x20, 0x69, 0x74, 0x20, 0x65, 0x78, 0x69, - 0x73, 0x74, 0x73, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, - 0x75, 0x73, 0x65, 0x64, 0x20, 0x61, 0x73, 0x20, 0x61, 0x20, 0x63, 0x6f, - 0x6e, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x20, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x70, 0x72, 0x6f, 0x78, - 0x79, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, - 0x20, 0x50, 0x6f, 0x72, 0x74, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x78, 0x79, - 0x20, 0x65, 0x78, 0x74, 0x65, 0x6e, 0x64, 0x73, 0x20, 0x43, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, - 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x28, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x7b, 0x20, 0x63, 0x68, 0x69, - 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7d, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x20, 0x7c, - 0x7c, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x41, 0x70, 0x70, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x20, 0x7b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x60, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, 0x20, 0x63, 0x6c, - 0x61, 0x73, 0x73, 0x3d, 0x22, 0x6d, 0x6f, 0x64, 0x65, 0x2d, 0x24, 0x7b, - 0x73, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x7d, 0x22, 0x3e, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x65, 0x61, - 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x68, 0x31, 0x3e, 0x6c, 0x6c, 0x61, - 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x68, 0x31, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, - 0x68, 0x65, 0x61, 0x64, 0x65, 0x72, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x6d, 0x61, 0x69, 0x6e, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x6e, 0x74, - 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x63, 0x68, 0x61, 0x74, 0x53, 0x74, - 0x61, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x3f, 0x20, 0x43, 0x68, 0x61, 0x74, 0x4c, 0x6f, 0x67, 0x20, 0x3a, 0x20, - 0x43, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x46, 0x6f, 0x72, 0x6d, 0x7d, 0x20, - 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x3c, 0x2f, 0x6d, 0x61, 0x69, 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x73, 0x65, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x77, 0x72, 0x69, - 0x74, 0x65, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x24, 0x7b, 0x73, 0x65, 0x73, 0x73, - 0x69, 0x6f, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x74, 0x79, - 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x63, 0x68, 0x61, 0x74, - 0x27, 0x20, 0x3f, 0x20, 0x4d, 0x65, 0x73, 0x73, 0x61, 0x67, 0x65, 0x49, - 0x6e, 0x70, 0x75, 0x74, 0x20, 0x3a, 0x20, 0x43, 0x6f, 0x6d, 0x70, 0x6c, - 0x65, 0x74, 0x69, 0x6f, 0x6e, 0x43, 0x6f, 0x6e, 0x74, 0x72, 0x6f, 0x6c, - 0x73, 0x7d, 0x20, 0x2f, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x65, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x3e, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x3c, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x70, 0x3e, 0x3c, 0x24, 0x7b, 0x4d, 0x6f, 0x64, 0x65, 0x6c, 0x47, 0x65, - 0x6e, 0x65, 0x72, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, 0x6e, 0x66, 0x6f, - 0x7d, 0x20, 0x2f, 0x3e, 0x3c, 0x2f, 0x70, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x70, 0x3e, - 0x50, 0x6f, 0x77, 0x65, 0x72, 0x65, 0x64, 0x20, 0x62, 0x79, 0x20, 0x3c, - 0x61, 0x20, 0x68, 0x72, 0x65, 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, - 0x73, 0x3a, 0x2f, 0x2f, 0x67, 0x69, 0x74, 0x68, 0x75, 0x62, 0x2e, 0x63, - 0x6f, 0x6d, 0x2f, 0x67, 0x67, 0x65, 0x72, 0x67, 0x61, 0x6e, 0x6f, 0x76, - 0x2f, 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x22, 0x3e, - 0x6c, 0x6c, 0x61, 0x6d, 0x61, 0x2e, 0x63, 0x70, 0x70, 0x3c, 0x2f, 0x61, - 0x3e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x3c, 0x61, 0x20, 0x68, 0x72, 0x65, - 0x66, 0x3d, 0x22, 0x68, 0x74, 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x67, - 0x67, 0x6d, 0x6c, 0x2e, 0x61, 0x69, 0x22, 0x3e, 0x67, 0x67, 0x6d, 0x6c, - 0x2e, 0x61, 0x69, 0x3c, 0x2f, 0x61, 0x3e, 0x2e, 0x3c, 0x2f, 0x70, 0x3e, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, - 0x2f, 0x66, 0x6f, 0x6f, 0x74, 0x65, 0x72, 0x3e, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x6e, 0x64, - 0x65, 0x72, 0x28, 0x68, 0x28, 0x41, 0x70, 0x70, 0x29, 0x2c, 0x20, 0x64, - 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x71, 0x75, 0x65, 0x72, - 0x79, 0x53, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x27, 0x23, - 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, 0x65, 0x72, 0x27, 0x29, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74, - 0x3e, 0x0a, 0x3c, 0x2f, 0x68, 0x65, 0x61, 0x64, 0x3e, 0x0a, 0x0a, 0x3c, - 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x20, 0x20, 0x3c, 0x64, 0x69, 0x76, - 0x20, 0x69, 0x64, 0x3d, 0x22, 0x63, 0x6f, 0x6e, 0x74, 0x61, 0x69, 0x6e, - 0x65, 0x72, 0x22, 0x3e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x3c, 0x69, 0x6e, - 0x70, 0x75, 0x74, 0x20, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x22, 0x66, 0x69, - 0x6c, 0x65, 0x22, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x66, 0x69, 0x6c, 0x65, - 0x49, 0x6e, 0x70, 0x75, 0x74, 0x22, 0x20, 0x61, 0x63, 0x63, 0x65, 0x70, - 0x74, 0x3d, 0x22, 0x69, 0x6d, 0x61, 0x67, 0x65, 0x2f, 0x2a, 0x22, 0x20, - 0x73, 0x74, 0x79, 0x6c, 0x65, 0x3d, 0x22, 0x64, 0x69, 0x73, 0x70, 0x6c, - 0x61, 0x79, 0x3a, 0x20, 0x6e, 0x6f, 0x6e, 0x65, 0x3b, 0x22, 0x3e, 0x0a, - 0x20, 0x20, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x20, 0x20, 0x3c, - 0x64, 0x69, 0x76, 0x20, 0x69, 0x64, 0x3d, 0x22, 0x70, 0x6f, 0x72, 0x74, - 0x61, 0x6c, 0x22, 0x3e, 0x3c, 0x2f, 0x64, 0x69, 0x76, 0x3e, 0x0a, 0x3c, - 0x2f, 0x62, 0x6f, 0x64, 0x79, 0x3e, 0x0a, 0x0a, 0x3c, 0x2f, 0x68, 0x74, - 0x6d, 0x6c, 0x3e, 0x0a, 0x0a -}; -size_t index_html_len = 33809; diff --git a/examples/server/index.js.hpp b/examples/server/index.js.hpp deleted file mode 100644 index 24d0b33fa..000000000 --- a/examples/server/index.js.hpp +++ /dev/null @@ -1,1931 +0,0 @@ -unsigned char index_js[] = { - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x28, 0x29, - 0x7b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, - 0x72, 0x72, 0x6f, 0x72, 0x28, 0x22, 0x43, 0x79, 0x63, 0x6c, 0x65, 0x20, - 0x64, 0x65, 0x74, 0x65, 0x63, 0x74, 0x65, 0x64, 0x22, 0x29, 0x7d, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x53, 0x79, 0x6d, 0x62, 0x6f, - 0x6c, 0x2e, 0x66, 0x6f, 0x72, 0x28, 0x22, 0x70, 0x72, 0x65, 0x61, 0x63, - 0x74, 0x2d, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x73, 0x22, 0x29, 0x3b, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x28, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x66, 0x3e, 0x31, 0x29, 0x7b, 0x66, 0x2d, 0x2d, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7d, 0x6c, 0x65, 0x74, 0x20, - 0x74, 0x2c, 0x6e, 0x3d, 0x21, 0x31, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, - 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6f, 0x29, - 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x3d, 0x6f, 0x3b, 0x6f, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6c, 0x2b, 0x2b, 0x3b, 0x77, 0x68, - 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x5f, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x69, 0x3d, - 0x5f, 0x2e, 0x6f, 0x3b, 0x5f, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x3b, 0x5f, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69, - 0x66, 0x28, 0x21, 0x28, 0x38, 0x26, 0x5f, 0x2e, 0x66, 0x29, 0x26, 0x26, - 0x70, 0x28, 0x5f, 0x29, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x5f, 0x2e, 0x63, - 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x65, 0x29, 0x7b, - 0x69, 0x66, 0x28, 0x21, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x65, 0x3b, 0x6e, - 0x3d, 0x21, 0x30, 0x7d, 0x7d, 0x5f, 0x3d, 0x69, 0x7d, 0x7d, 0x6c, 0x3d, - 0x30, 0x3b, 0x66, 0x2d, 0x2d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x74, - 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x5f, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x66, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x28, 0x29, 0x3b, 0x66, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, - 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x7d, 0x6c, - 0x65, 0x74, 0x20, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, 0x30, 0x3b, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x75, 0x28, 0x74, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x72, 0x3e, 0x30, 0x29, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x6e, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x3b, 0x72, 0x2b, 0x2b, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x29, 0x7d, 0x66, 0x69, 0x6e, - 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x72, 0x2d, 0x2d, 0x3b, 0x69, 0x3d, 0x6e, - 0x7d, 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x66, 0x3d, 0x30, 0x2c, 0x6c, 0x3d, - 0x30, 0x2c, 0x73, 0x3d, 0x30, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x63, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x69, 0x29, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, - 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x6e, 0x2e, 0x74, 0x21, 0x3d, 0x3d, - 0x69, 0x29, 0x7b, 0x6e, 0x3d, 0x7b, 0x69, 0x3a, 0x30, 0x2c, 0x53, 0x3a, - 0x74, 0x2c, 0x70, 0x3a, 0x69, 0x2e, 0x73, 0x2c, 0x6e, 0x3a, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x3a, 0x69, 0x2c, 0x65, 0x3a, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x78, 0x3a, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x2c, 0x72, 0x3a, 0x6e, 0x7d, 0x3b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x69, 0x2e, 0x73, 0x29, - 0x69, 0x2e, 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, - 0x6e, 0x3b, 0x74, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x33, - 0x32, 0x26, 0x69, 0x2e, 0x66, 0x29, 0x74, 0x2e, 0x53, 0x28, 0x6e, 0x29, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, 0x6e, - 0x2e, 0x69, 0x29, 0x7b, 0x6e, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x69, 0x66, - 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, - 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x6e, 0x2e, 0x70, - 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x6e, 0x2e, 0x70, 0x29, 0x6e, 0x2e, 0x70, 0x2e, 0x6e, 0x3d, 0x6e, - 0x2e, 0x6e, 0x3b, 0x6e, 0x2e, 0x70, 0x3d, 0x69, 0x2e, 0x73, 0x3b, 0x6e, - 0x2e, 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x2e, - 0x73, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x69, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x69, 0x3d, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6e, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x74, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x68, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x62, 0x72, - 0x61, 0x6e, 0x64, 0x3d, 0x6e, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x68, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x26, 0x26, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x65, 0x29, 0x7b, - 0x74, 0x2e, 0x78, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x69, - 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, - 0x2e, 0x65, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3d, - 0x74, 0x7d, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, - 0x79, 0x70, 0x65, 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, - 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, - 0x65, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x78, 0x3b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x7b, 0x6e, - 0x2e, 0x78, 0x3d, 0x65, 0x3b, 0x74, 0x2e, 0x65, 0x3d, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x65, 0x2e, 0x65, 0x3d, 0x6e, - 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, - 0x69, 0x66, 0x28, 0x74, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x74, 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3d, 0x65, 0x7d, 0x7d, - 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, - 0x2e, 0x73, 0x75, 0x62, 0x73, 0x63, 0x72, 0x69, 0x62, 0x65, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x3b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x77, 0x28, 0x28, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2c, 0x5f, 0x3d, 0x33, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, - 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x33, - 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x28, 0x65, 0x29, 0x7d, 0x66, 0x69, - 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, - 0x7c, 0x3d, 0x5f, 0x7d, 0x7d, 0x29, 0x29, 0x7d, 0x3b, 0x68, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x4f, 0x66, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x68, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, - 0x6f, 0x53, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x2b, 0x22, 0x22, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x6f, 0x4a, 0x53, 0x4f, 0x4e, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x3b, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, - 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, - 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x68, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28, - 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x63, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x69, 0x3d, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x2c, 0x73, 0x65, - 0x74, 0x28, 0x6e, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x20, 0x69, 0x6e, - 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, 0x6f, 0x66, 0x20, 0x79, 0x29, 0x21, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x74, - 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, - 0x6f, 0x72, 0x28, 0x22, 0x43, 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, - 0x20, 0x63, 0x61, 0x6e, 0x6e, 0x6f, 0x74, 0x20, 0x68, 0x61, 0x76, 0x65, - 0x20, 0x73, 0x69, 0x64, 0x65, 0x2d, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, - 0x73, 0x22, 0x29, 0x7d, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x21, - 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x29, 0x7b, 0x69, 0x66, - 0x28, 0x6c, 0x3e, 0x31, 0x30, 0x30, 0x29, 0x74, 0x28, 0x29, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x6e, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x69, 0x2b, 0x2b, 0x3b, 0x73, 0x2b, 0x2b, 0x3b, 0x66, 0x2b, 0x2b, - 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, - 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, - 0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74, 0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x66, - 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x65, 0x28, 0x29, 0x7d, 0x7d, - 0x7d, 0x7d, 0x29, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x61, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x6e, 0x65, 0x77, 0x20, 0x68, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x70, 0x28, 0x74, 0x29, 0x7b, - 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, - 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, - 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x69, 0x66, 0x28, 0x6e, 0x2e, - 0x53, 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, 0x7c, 0x7c, 0x21, - 0x6e, 0x2e, 0x53, 0x2e, 0x68, 0x28, 0x29, 0x7c, 0x7c, 0x6e, 0x2e, 0x53, - 0x2e, 0x69, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x69, 0x29, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x31, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x64, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, - 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, - 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x53, - 0x2e, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x65, 0x29, 0x6e, 0x2e, 0x72, 0x3d, 0x65, 0x3b, 0x6e, - 0x2e, 0x53, 0x2e, 0x6e, 0x3d, 0x6e, 0x3b, 0x6e, 0x2e, 0x69, 0x3d, 0x2d, - 0x31, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, - 0x3d, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x73, 0x3d, 0x6e, - 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x7d, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x76, 0x28, 0x74, 0x29, 0x7b, 0x6c, - 0x65, 0x74, 0x20, 0x6e, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x73, 0x3b, 0x77, - 0x68, 0x69, 0x6c, 0x65, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, - 0x3d, 0x3d, 0x65, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, - 0x3d, 0x65, 0x2e, 0x70, 0x3b, 0x69, 0x66, 0x28, 0x2d, 0x31, 0x3d, 0x3d, - 0x3d, 0x65, 0x2e, 0x69, 0x29, 0x7b, 0x65, 0x2e, 0x53, 0x2e, 0x55, 0x28, - 0x65, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x74, 0x29, 0x74, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x6e, - 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x65, 0x2e, 0x6e, 0x29, 0x65, 0x2e, 0x6e, 0x2e, 0x70, 0x3d, 0x74, - 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x6e, 0x3d, 0x65, 0x3b, 0x65, 0x2e, - 0x53, 0x2e, 0x6e, 0x3d, 0x65, 0x2e, 0x72, 0x3b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x72, 0x29, - 0x65, 0x2e, 0x72, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x65, - 0x3d, 0x74, 0x7d, 0x74, 0x2e, 0x73, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x79, 0x28, 0x74, 0x29, 0x7b, 0x68, - 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, - 0x3d, 0x73, 0x2d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x3d, - 0x34, 0x7d, 0x28, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x68, 0x29, 0x2e, 0x68, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x3b, 0x69, 0x66, - 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x33, 0x32, - 0x3d, 0x3d, 0x28, 0x33, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, - 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x35, 0x3b, 0x69, 0x66, - 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x3d, 0x3d, 0x3d, 0x73, 0x29, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x67, 0x3d, 0x73, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, - 0x7c, 0x3d, 0x31, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x69, 0x3e, 0x30, 0x26, 0x26, 0x21, 0x70, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, - 0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x7d, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x69, 0x3b, 0x74, 0x72, 0x79, - 0x7b, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x3d, 0x74, - 0x68, 0x69, 0x73, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, - 0x31, 0x36, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x7c, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x21, 0x3d, 0x3d, 0x74, 0x7c, 0x7c, 0x30, - 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x69, 0x29, 0x7b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x31, 0x37, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, - 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3d, 0x74, 0x3b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x31, 0x36, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x69, 0x2b, 0x2b, 0x7d, 0x69, 0x3d, 0x74, 0x3b, - 0x76, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x21, 0x30, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, - 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x33, - 0x36, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x21, 0x3d, 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29, - 0x74, 0x2e, 0x53, 0x2e, 0x53, 0x28, 0x74, 0x29, 0x7d, 0x68, 0x2e, 0x70, - 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x53, 0x2e, 0x63, - 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x29, 0x7d, - 0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, - 0x2e, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x68, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x55, - 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, - 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, - 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x74, 0x29, 0x7b, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x33, 0x33, 0x3b, 0x66, 0x6f, - 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x73, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, - 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x6e, 0x29, 0x74, 0x2e, 0x53, 0x2e, - 0x55, 0x28, 0x74, 0x29, 0x7d, 0x7d, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72, - 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x21, 0x28, 0x32, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, - 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x36, 0x3b, 0x66, - 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x74, 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x74, 0x3b, 0x74, 0x3d, 0x74, 0x2e, 0x78, 0x29, 0x74, 0x2e, 0x74, - 0x2e, 0x4e, 0x28, 0x29, 0x7d, 0x7d, 0x3b, 0x79, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x69, - 0x66, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x68, 0x28, 0x29, 0x29, - 0x74, 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x3b, 0x4f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x50, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x79, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x22, 0x76, 0x61, 0x6c, 0x75, - 0x65, 0x22, 0x2c, 0x7b, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, 0x69, 0x66, - 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x28, - 0x29, 0x3b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x63, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x68, - 0x28, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x21, 0x3d, 0x3d, 0x6e, 0x29, 0x6e, 0x2e, 0x69, 0x3d, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x31, 0x36, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x29, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x76, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x7d, 0x7d, 0x29, 0x3b, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6d, 0x28, 0x74, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x65, 0x77, 0x20, - 0x79, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x67, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x75, 0x3b, 0x74, 0x2e, 0x75, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, - 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x29, 0x7b, 0x66, 0x2b, 0x2b, 0x3b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5f, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x6e, 0x28, - 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, - 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x74, 0x2e, 0x66, 0x7c, 0x3d, - 0x38, 0x3b, 0x62, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x68, 0x72, 0x6f, 0x77, - 0x20, 0x6e, 0x7d, 0x66, 0x69, 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x69, - 0x3d, 0x5f, 0x3b, 0x65, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x62, 0x28, 0x74, 0x29, 0x7b, 0x66, - 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x73, - 0x3b, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x3b, - 0x6e, 0x3d, 0x6e, 0x2e, 0x6e, 0x29, 0x6e, 0x2e, 0x53, 0x2e, 0x55, 0x28, - 0x6e, 0x29, 0x3b, 0x74, 0x2e, 0x78, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x3b, 0x74, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x3b, 0x67, 0x28, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x6b, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x69, - 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x29, 0x74, 0x68, 0x72, 0x6f, - 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, - 0x22, 0x4f, 0x75, 0x74, 0x2d, 0x6f, 0x66, 0x2d, 0x6f, 0x72, 0x64, 0x65, - 0x72, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, 0x74, 0x22, 0x29, 0x3b, 0x76, - 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x69, 0x3d, 0x74, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x69, 0x66, - 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x62, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x65, 0x28, 0x29, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x53, 0x28, 0x74, 0x29, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x3d, 0x74, 0x3b, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x73, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x3d, 0x33, 0x32, - 0x7d, 0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, - 0x2e, 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x53, 0x28, 0x29, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x69, - 0x66, 0x28, 0x38, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, - 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x3b, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x78, 0x28, 0x29, - 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, - 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x75, 0x3d, 0x6e, 0x7d, 0x66, 0x69, - 0x6e, 0x61, 0x6c, 0x6c, 0x79, 0x7b, 0x74, 0x28, 0x29, 0x7d, 0x7d, 0x3b, - 0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, - 0x53, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, - 0x29, 0x74, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, - 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x26, 0x3d, 0x2d, - 0x39, 0x3b, 0x67, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x64, 0x28, - 0x74, 0x68, 0x69, 0x73, 0x29, 0x3b, 0x66, 0x2b, 0x2b, 0x3b, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x69, 0x3b, 0x69, 0x3d, 0x74, 0x68, - 0x69, 0x73, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, 0x2e, - 0x62, 0x69, 0x6e, 0x64, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x29, - 0x7d, 0x3b, 0x53, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, - 0x65, 0x2e, 0x4e, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x32, 0x26, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x66, 0x29, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6f, 0x3d, - 0x6f, 0x3b, 0x6f, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x7d, 0x3b, 0x53, - 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x7c, 0x3d, 0x38, 0x3b, 0x69, 0x66, - 0x28, 0x21, 0x28, 0x31, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x66, 0x29, - 0x29, 0x62, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x7d, 0x3b, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x77, 0x28, 0x74, 0x29, 0x7b, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, - 0x53, 0x28, 0x74, 0x29, 0x3b, 0x74, 0x72, 0x79, 0x7b, 0x6e, 0x2e, 0x63, - 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, - 0x6e, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, - 0x74, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, 0x64, - 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x29, 0x7d, 0x76, 0x61, 0x72, - 0x20, 0x78, 0x2c, 0x43, 0x2c, 0x45, 0x2c, 0x55, 0x2c, 0x48, 0x2c, 0x50, - 0x2c, 0x4e, 0x2c, 0x24, 0x2c, 0x44, 0x2c, 0x54, 0x3d, 0x7b, 0x7d, 0x2c, - 0x56, 0x3d, 0x5b, 0x5d, 0x2c, 0x41, 0x3d, 0x2f, 0x61, 0x63, 0x69, 0x74, - 0x7c, 0x65, 0x78, 0x28, 0x3f, 0x3a, 0x73, 0x7c, 0x67, 0x7c, 0x6e, 0x7c, - 0x70, 0x7c, 0x24, 0x29, 0x7c, 0x72, 0x70, 0x68, 0x7c, 0x67, 0x72, 0x69, - 0x64, 0x7c, 0x6f, 0x77, 0x73, 0x7c, 0x6d, 0x6e, 0x63, 0x7c, 0x6e, 0x74, - 0x77, 0x7c, 0x69, 0x6e, 0x65, 0x5b, 0x63, 0x68, 0x5d, 0x7c, 0x7a, 0x6f, - 0x6f, 0x7c, 0x5e, 0x6f, 0x72, 0x64, 0x7c, 0x69, 0x74, 0x65, 0x72, 0x61, - 0x2f, 0x69, 0x2c, 0x46, 0x3d, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, - 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x66, - 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x65, 0x20, 0x69, 0x6e, 0x20, - 0x6e, 0x29, 0x74, 0x5b, 0x65, 0x5d, 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x57, 0x28, 0x74, 0x29, 0x7b, 0x76, - 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x70, 0x61, 0x72, 0x65, 0x6e, - 0x74, 0x4e, 0x6f, 0x64, 0x65, 0x3b, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x72, - 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x28, 0x74, - 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3d, 0x7b, 0x7d, 0x3b, 0x66, - 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x22, 0x6b, - 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, 0x5b, 0x6f, - 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x69, - 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x72, 0x5b, 0x6f, 0x5d, 0x3d, 0x6e, - 0x5b, 0x6f, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, - 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, - 0x32, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, - 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, - 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, 0x3f, 0x78, 0x2e, - 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, - 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, 0x22, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, - 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, - 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6f, 0x20, 0x69, 0x6e, - 0x20, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, - 0x3d, 0x72, 0x5b, 0x6f, 0x5d, 0x26, 0x26, 0x28, 0x72, 0x5b, 0x6f, 0x5d, - 0x3d, 0x74, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x5b, 0x6f, 0x5d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x72, 0x2c, 0x5f, 0x2c, 0x69, - 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, - 0x5f, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x7b, - 0x74, 0x79, 0x70, 0x65, 0x3a, 0x74, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x3a, 0x6e, 0x2c, 0x6b, 0x65, 0x79, 0x3a, 0x65, 0x2c, 0x72, 0x65, 0x66, - 0x3a, 0x5f, 0x2c, 0x5f, 0x5f, 0x6b, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, - 0x5f, 0x5f, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, 0x5f, 0x62, 0x3a, - 0x30, 0x2c, 0x5f, 0x5f, 0x65, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5f, - 0x5f, 0x64, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x5f, - 0x63, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x2c, 0x5f, 0x5f, 0x76, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, - 0x69, 0x3f, 0x2b, 0x2b, 0x45, 0x3a, 0x69, 0x2c, 0x5f, 0x5f, 0x69, 0x3a, - 0x2d, 0x31, 0x2c, 0x5f, 0x5f, 0x75, 0x3a, 0x30, 0x7d, 0x3b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x69, - 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x43, 0x2e, 0x76, 0x6e, - 0x6f, 0x64, 0x65, 0x26, 0x26, 0x43, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, - 0x28, 0x6f, 0x29, 0x2c, 0x6f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x52, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3a, 0x6e, 0x75, - 0x6c, 0x6c, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x6a, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x7d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x28, 0x74, - 0x2c, 0x6e, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, - 0x70, 0x73, 0x3d, 0x74, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, - 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x71, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, - 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6e, 0x29, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x3f, 0x71, - 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x69, 0x2b, - 0x31, 0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, 0x66, 0x6f, 0x72, 0x28, - 0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x6e, 0x3c, 0x74, 0x2e, 0x5f, 0x5f, - 0x6b, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6e, 0x2b, 0x2b, - 0x29, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x65, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x6e, 0x5d, 0x29, 0x26, 0x26, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x29, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x5f, 0x5f, 0x65, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, - 0x66, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3f, 0x71, 0x28, 0x74, - 0x29, 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x42, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x3d, 0x74, 0x2e, 0x5f, - 0x5f, 0x76, 0x2c, 0x6f, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x72, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x50, 0x3b, 0x69, 0x66, 0x28, 0x72, 0x29, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x28, 0x5f, 0x3d, 0x4d, 0x28, 0x7b, - 0x7d, 0x2c, 0x69, 0x29, 0x29, 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x69, 0x2e, - 0x5f, 0x5f, 0x76, 0x2b, 0x31, 0x2c, 0x43, 0x2e, 0x76, 0x6e, 0x6f, 0x64, - 0x65, 0x26, 0x26, 0x43, 0x2e, 0x76, 0x6e, 0x6f, 0x64, 0x65, 0x28, 0x5f, - 0x29, 0x2c, 0x69, 0x74, 0x28, 0x72, 0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x74, - 0x2e, 0x5f, 0x5f, 0x6e, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, - 0x3d, 0x3d, 0x72, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, - 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x33, 0x32, 0x26, 0x69, - 0x2e, 0x5f, 0x5f, 0x75, 0x3f, 0x5b, 0x6f, 0x5d, 0x3a, 0x6e, 0x75, 0x6c, - 0x6c, 0x2c, 0x6e, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x3f, - 0x71, 0x28, 0x69, 0x29, 0x3a, 0x6f, 0x2c, 0x21, 0x21, 0x28, 0x33, 0x32, - 0x26, 0x69, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x2c, 0x65, 0x29, 0x2c, 0x5f, - 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x76, 0x2c, 0x5f, - 0x2e, 0x5f, 0x5f, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x5f, 0x2e, 0x5f, 0x5f, - 0x69, 0x5d, 0x3d, 0x5f, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x65, 0x21, - 0x3d, 0x6f, 0x26, 0x26, 0x47, 0x28, 0x5f, 0x29, 0x2c, 0x5f, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x47, 0x28, 0x74, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3b, 0x69, 0x66, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x74, 0x3d, 0x74, 0x2e, 0x5f, - 0x5f, 0x29, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x2e, - 0x5f, 0x5f, 0x63, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x74, 0x2e, 0x5f, - 0x5f, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x62, 0x61, 0x73, - 0x65, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x3d, 0x30, 0x3b, 0x6e, - 0x3c, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x3b, 0x6e, 0x2b, 0x2b, 0x29, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x21, 0x3d, 0x28, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, - 0x6e, 0x5d, 0x29, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x65, - 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x65, - 0x2e, 0x5f, 0x5f, 0x65, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x47, 0x28, 0x74, 0x29, 0x7d, 0x7d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x7a, 0x28, 0x74, - 0x29, 0x7b, 0x28, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, 0x26, 0x28, - 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x21, 0x30, 0x29, 0x26, 0x26, 0x48, - 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x26, 0x26, 0x21, 0x4a, - 0x2e, 0x5f, 0x5f, 0x72, 0x2b, 0x2b, 0x7c, 0x7c, 0x50, 0x21, 0x3d, 0x3d, - 0x43, 0x2e, 0x64, 0x65, 0x62, 0x6f, 0x75, 0x6e, 0x63, 0x65, 0x52, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x29, 0x26, 0x26, 0x28, 0x28, - 0x50, 0x3d, 0x43, 0x2e, 0x64, 0x65, 0x62, 0x6f, 0x75, 0x6e, 0x63, 0x65, - 0x52, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x69, 0x6e, 0x67, 0x29, 0x7c, 0x7c, - 0x4e, 0x29, 0x28, 0x4a, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x4a, 0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, - 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x3d, 0x5b, 0x5d, 0x2c, 0x69, 0x3d, - 0x5b, 0x5d, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x48, 0x2e, 0x73, 0x6f, 0x72, - 0x74, 0x28, 0x24, 0x29, 0x3b, 0x74, 0x3d, 0x48, 0x2e, 0x73, 0x68, 0x69, - 0x66, 0x74, 0x28, 0x29, 0x3b, 0x29, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, - 0x26, 0x28, 0x65, 0x3d, 0x48, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x2c, 0x6e, 0x3d, 0x42, 0x28, 0x74, 0x2c, 0x5f, 0x2c, 0x69, 0x29, 0x7c, - 0x7c, 0x6e, 0x2c, 0x30, 0x3d, 0x3d, 0x3d, 0x65, 0x7c, 0x7c, 0x48, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x65, 0x3f, 0x28, 0x6f, 0x74, - 0x28, 0x5f, 0x2c, 0x6e, 0x2c, 0x69, 0x29, 0x2c, 0x69, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x3d, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x3d, 0x30, 0x2c, 0x6e, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x2c, 0x48, 0x2e, 0x73, 0x6f, 0x72, 0x74, 0x28, 0x24, 0x29, 0x29, 0x3a, - 0x6e, 0x26, 0x26, 0x43, 0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x43, 0x2e, - 0x5f, 0x5f, 0x63, 0x28, 0x6e, 0x2c, 0x56, 0x29, 0x29, 0x3b, 0x6e, 0x26, - 0x26, 0x6f, 0x74, 0x28, 0x5f, 0x2c, 0x6e, 0x2c, 0x69, 0x29, 0x2c, 0x4a, - 0x2e, 0x5f, 0x5f, 0x72, 0x3d, 0x30, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x4b, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, - 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x2c, - 0x6c, 0x2c, 0x73, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x63, 0x2c, 0x68, - 0x2c, 0x61, 0x2c, 0x70, 0x2c, 0x64, 0x2c, 0x76, 0x3d, 0x5f, 0x26, 0x26, - 0x5f, 0x2e, 0x5f, 0x5f, 0x6b, 0x7c, 0x7c, 0x56, 0x2c, 0x79, 0x3d, 0x6e, - 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x66, 0x6f, 0x72, 0x28, - 0x65, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x66, 0x2c, 0x51, 0x28, 0x65, 0x2c, - 0x6e, 0x2c, 0x76, 0x29, 0x2c, 0x66, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x64, - 0x2c, 0x63, 0x3d, 0x30, 0x3b, 0x63, 0x3c, 0x79, 0x3b, 0x63, 0x2b, 0x2b, - 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x61, 0x3d, 0x65, 0x2e, - 0x5f, 0x5f, 0x6b, 0x5b, 0x63, 0x5d, 0x29, 0x26, 0x26, 0x22, 0x62, 0x6f, - 0x6f, 0x6c, 0x65, 0x61, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x61, 0x26, 0x26, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x61, 0x26, 0x26, 0x28, 0x68, 0x3d, 0x2d, 0x31, 0x3d, 0x3d, 0x3d, - 0x61, 0x2e, 0x5f, 0x5f, 0x69, 0x3f, 0x54, 0x3a, 0x76, 0x5b, 0x61, 0x2e, - 0x5f, 0x5f, 0x69, 0x5d, 0x7c, 0x7c, 0x54, 0x2c, 0x61, 0x2e, 0x5f, 0x5f, - 0x69, 0x3d, 0x63, 0x2c, 0x69, 0x74, 0x28, 0x74, 0x2c, 0x61, 0x2c, 0x68, - 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x2c, 0x6c, - 0x2c, 0x73, 0x29, 0x2c, 0x70, 0x3d, 0x61, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, - 0x61, 0x2e, 0x72, 0x65, 0x66, 0x26, 0x26, 0x68, 0x2e, 0x72, 0x65, 0x66, - 0x21, 0x3d, 0x61, 0x2e, 0x72, 0x65, 0x66, 0x26, 0x26, 0x28, 0x68, 0x2e, - 0x72, 0x65, 0x66, 0x26, 0x26, 0x75, 0x74, 0x28, 0x68, 0x2e, 0x72, 0x65, - 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x61, 0x29, 0x2c, 0x73, 0x2e, - 0x70, 0x75, 0x73, 0x68, 0x28, 0x61, 0x2e, 0x72, 0x65, 0x66, 0x2c, 0x61, - 0x2e, 0x5f, 0x5f, 0x63, 0x7c, 0x7c, 0x70, 0x2c, 0x61, 0x29, 0x29, 0x2c, - 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x64, 0x26, 0x26, 0x6e, 0x75, 0x6c, - 0x6c, 0x21, 0x3d, 0x70, 0x26, 0x26, 0x28, 0x64, 0x3d, 0x70, 0x29, 0x2c, - 0x36, 0x35, 0x35, 0x33, 0x36, 0x26, 0x61, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, - 0x7c, 0x68, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x3d, 0x3d, 0x61, 0x2e, 0x5f, - 0x5f, 0x6b, 0x3f, 0x66, 0x3d, 0x58, 0x28, 0x61, 0x2c, 0x66, 0x2c, 0x74, - 0x29, 0x3a, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x61, 0x2e, 0x74, - 0x79, 0x70, 0x65, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, - 0x3d, 0x3d, 0x61, 0x2e, 0x5f, 0x5f, 0x64, 0x3f, 0x66, 0x3d, 0x61, 0x2e, - 0x5f, 0x5f, 0x64, 0x3a, 0x70, 0x26, 0x26, 0x28, 0x66, 0x3d, 0x70, 0x2e, - 0x6e, 0x65, 0x78, 0x74, 0x53, 0x69, 0x62, 0x6c, 0x69, 0x6e, 0x67, 0x29, - 0x2c, 0x61, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x2c, 0x61, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x3d, 0x2d, 0x31, 0x39, - 0x36, 0x36, 0x30, 0x39, 0x29, 0x3b, 0x65, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, - 0x66, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x64, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x51, 0x28, 0x74, 0x2c, 0x6e, - 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x2c, - 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x3d, 0x6e, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x2c, 0x6c, 0x3d, 0x65, 0x2e, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x2c, 0x73, 0x3d, 0x6c, 0x2c, 0x63, 0x3d, 0x30, 0x3b, - 0x66, 0x6f, 0x72, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x5b, 0x5d, - 0x2c, 0x5f, 0x3d, 0x30, 0x3b, 0x5f, 0x3c, 0x66, 0x3b, 0x5f, 0x2b, 0x2b, - 0x29, 0x72, 0x3d, 0x5f, 0x2b, 0x63, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, - 0x3d, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x5b, 0x5f, 0x5d, - 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x28, 0x69, 0x3d, 0x6e, 0x5b, - 0x5f, 0x5d, 0x29, 0x7c, 0x7c, 0x22, 0x62, 0x6f, 0x6f, 0x6c, 0x65, 0x61, - 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, - 0x7c, 0x7c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x3f, 0x6e, - 0x75, 0x6c, 0x6c, 0x3a, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x7c, 0x7c, - 0x22, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x22, 0x3d, 0x3d, 0x74, 0x79, - 0x70, 0x65, 0x6f, 0x66, 0x20, 0x69, 0x7c, 0x7c, 0x22, 0x62, 0x69, 0x67, - 0x69, 0x6e, 0x74, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x69, 0x7c, 0x7c, 0x69, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, - 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3d, 0x3d, 0x53, 0x74, 0x72, 0x69, 0x6e, - 0x67, 0x3f, 0x4f, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x69, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x75, 0x6c, - 0x6c, 0x29, 0x3a, 0x46, 0x28, 0x69, 0x29, 0x3f, 0x4f, 0x28, 0x6a, 0x2c, - 0x7b, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3a, 0x69, 0x7d, - 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x29, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, - 0x3d, 0x3d, 0x69, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, - 0x74, 0x6f, 0x72, 0x26, 0x26, 0x69, 0x2e, 0x5f, 0x5f, 0x62, 0x3e, 0x30, - 0x3f, 0x4f, 0x28, 0x69, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x69, 0x2e, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x69, 0x2e, 0x6b, 0x65, 0x79, 0x2c, - 0x69, 0x2e, 0x72, 0x65, 0x66, 0x3f, 0x69, 0x2e, 0x72, 0x65, 0x66, 0x3a, - 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x3a, - 0x69, 0x29, 0x3f, 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x69, - 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x62, 0x2b, 0x31, - 0x2c, 0x75, 0x3d, 0x5a, 0x28, 0x69, 0x2c, 0x65, 0x2c, 0x72, 0x2c, 0x73, - 0x29, 0x2c, 0x69, 0x2e, 0x5f, 0x5f, 0x69, 0x3d, 0x75, 0x2c, 0x6f, 0x3d, - 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x2d, 0x31, 0x21, 0x3d, 0x3d, 0x75, 0x26, - 0x26, 0x28, 0x73, 0x2d, 0x2d, 0x2c, 0x28, 0x6f, 0x3d, 0x65, 0x5b, 0x75, - 0x5d, 0x29, 0x26, 0x26, 0x28, 0x6f, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, - 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x6f, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, - 0x3d, 0x6f, 0x2e, 0x5f, 0x5f, 0x76, 0x3f, 0x28, 0x2d, 0x31, 0x3d, 0x3d, - 0x75, 0x26, 0x26, 0x63, 0x2d, 0x2d, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, - 0x66, 0x20, 0x69, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x28, 0x69, - 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, 0x36, 0x35, 0x35, 0x33, 0x36, 0x29, - 0x29, 0x3a, 0x75, 0x21, 0x3d, 0x3d, 0x72, 0x26, 0x26, 0x28, 0x75, 0x3d, - 0x3d, 0x3d, 0x72, 0x2b, 0x31, 0x3f, 0x63, 0x2b, 0x2b, 0x3a, 0x75, 0x3e, - 0x72, 0x3f, 0x73, 0x3e, 0x66, 0x2d, 0x72, 0x3f, 0x63, 0x2b, 0x3d, 0x75, - 0x2d, 0x72, 0x3a, 0x63, 0x2d, 0x2d, 0x3a, 0x75, 0x3c, 0x72, 0x3f, 0x75, - 0x3d, 0x3d, 0x72, 0x2d, 0x31, 0x26, 0x26, 0x28, 0x63, 0x3d, 0x75, 0x2d, - 0x72, 0x29, 0x3a, 0x63, 0x3d, 0x30, 0x2c, 0x75, 0x21, 0x3d, 0x3d, 0x5f, - 0x2b, 0x63, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, - 0x36, 0x35, 0x35, 0x33, 0x36, 0x29, 0x29, 0x29, 0x3a, 0x28, 0x6f, 0x3d, - 0x65, 0x5b, 0x72, 0x5d, 0x29, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, - 0x3d, 0x6f, 0x2e, 0x6b, 0x65, 0x79, 0x26, 0x26, 0x6f, 0x2e, 0x5f, 0x5f, - 0x65, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30, 0x37, - 0x32, 0x26, 0x6f, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x26, 0x26, 0x28, 0x6f, - 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x26, - 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x71, 0x28, 0x6f, 0x29, - 0x29, 0x2c, 0x66, 0x74, 0x28, 0x6f, 0x2c, 0x6f, 0x2c, 0x21, 0x31, 0x29, - 0x2c, 0x65, 0x5b, 0x72, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x73, - 0x2d, 0x2d, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x73, 0x29, 0x66, 0x6f, 0x72, - 0x28, 0x5f, 0x3d, 0x30, 0x3b, 0x5f, 0x3c, 0x6c, 0x3b, 0x5f, 0x2b, 0x2b, - 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x6f, 0x3d, 0x65, 0x5b, - 0x5f, 0x5d, 0x29, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, - 0x30, 0x37, 0x32, 0x26, 0x6f, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x26, 0x26, - 0x28, 0x6f, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, - 0x64, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x71, 0x28, - 0x6f, 0x29, 0x29, 0x2c, 0x66, 0x74, 0x28, 0x6f, 0x2c, 0x6f, 0x29, 0x29, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x28, - 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, - 0x2c, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x66, 0x6f, 0x72, - 0x28, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, 0x69, 0x3d, 0x30, - 0x3b, 0x5f, 0x26, 0x26, 0x69, 0x3c, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x3b, 0x69, 0x2b, 0x2b, 0x29, 0x5f, 0x5b, 0x69, 0x5d, 0x26, - 0x26, 0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, - 0x6e, 0x3d, 0x58, 0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, - 0x29, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, - 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x21, 0x3d, 0x6e, 0x26, 0x26, 0x28, 0x65, - 0x2e, 0x69, 0x6e, 0x73, 0x65, 0x72, 0x74, 0x42, 0x65, 0x66, 0x6f, 0x72, - 0x65, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x7c, 0x7c, 0x6e, - 0x75, 0x6c, 0x6c, 0x29, 0x2c, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, - 0x29, 0x3b, 0x64, 0x6f, 0x7b, 0x6e, 0x3d, 0x6e, 0x26, 0x26, 0x6e, 0x2e, - 0x6e, 0x65, 0x78, 0x74, 0x53, 0x69, 0x62, 0x6c, 0x69, 0x6e, 0x67, 0x7d, - 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, - 0x6e, 0x26, 0x26, 0x38, 0x3d, 0x3d, 0x3d, 0x6e, 0x2e, 0x6e, 0x6f, 0x64, - 0x65, 0x54, 0x79, 0x70, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x59, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6e, 0x3d, 0x6e, 0x7c, 0x7c, 0x5b, 0x5d, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x74, 0x7c, 0x7c, 0x22, 0x62, 0x6f, 0x6f, - 0x6c, 0x65, 0x61, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, - 0x66, 0x20, 0x74, 0x7c, 0x7c, 0x28, 0x46, 0x28, 0x74, 0x29, 0x3f, 0x74, - 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x59, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7d, 0x29, 0x29, 0x3a, 0x6e, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, - 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x5a, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x74, 0x2e, 0x6b, 0x65, - 0x79, 0x2c, 0x6f, 0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x72, - 0x3d, 0x65, 0x2d, 0x31, 0x2c, 0x75, 0x3d, 0x65, 0x2b, 0x31, 0x2c, 0x66, - 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x3d, 0x66, 0x7c, 0x7c, 0x66, 0x26, 0x26, 0x69, 0x3d, - 0x3d, 0x66, 0x2e, 0x6b, 0x65, 0x79, 0x26, 0x26, 0x6f, 0x3d, 0x3d, 0x3d, - 0x66, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, - 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x26, 0x66, 0x2e, 0x5f, 0x5f, 0x75, - 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x3b, 0x69, - 0x66, 0x28, 0x5f, 0x3e, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x66, - 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, - 0x26, 0x66, 0x2e, 0x5f, 0x5f, 0x75, 0x29, 0x3f, 0x31, 0x3a, 0x30, 0x29, - 0x29, 0x66, 0x6f, 0x72, 0x28, 0x3b, 0x72, 0x3e, 0x3d, 0x30, 0x7c, 0x7c, - 0x75, 0x3c, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x72, 0x3e, 0x3d, 0x30, 0x29, 0x7b, 0x69, 0x66, - 0x28, 0x28, 0x66, 0x3d, 0x6e, 0x5b, 0x72, 0x5d, 0x29, 0x26, 0x26, 0x30, - 0x3d, 0x3d, 0x28, 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x26, 0x66, 0x2e, - 0x5f, 0x5f, 0x75, 0x29, 0x26, 0x26, 0x69, 0x3d, 0x3d, 0x66, 0x2e, 0x6b, - 0x65, 0x79, 0x26, 0x26, 0x6f, 0x3d, 0x3d, 0x3d, 0x66, 0x2e, 0x74, 0x79, - 0x70, 0x65, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x3b, - 0x72, 0x2d, 0x2d, 0x7d, 0x69, 0x66, 0x28, 0x75, 0x3c, 0x6e, 0x2e, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x28, 0x66, - 0x3d, 0x6e, 0x5b, 0x75, 0x5d, 0x29, 0x26, 0x26, 0x30, 0x3d, 0x3d, 0x28, - 0x31, 0x33, 0x31, 0x30, 0x37, 0x32, 0x26, 0x66, 0x2e, 0x5f, 0x5f, 0x75, - 0x29, 0x26, 0x26, 0x69, 0x3d, 0x3d, 0x66, 0x2e, 0x6b, 0x65, 0x79, 0x26, - 0x26, 0x6f, 0x3d, 0x3d, 0x3d, 0x66, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x3b, 0x75, 0x2b, 0x2b, - 0x7d, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x2d, 0x31, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x74, 0x28, 0x74, - 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x22, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, - 0x6e, 0x5b, 0x30, 0x5d, 0x3f, 0x74, 0x2e, 0x73, 0x65, 0x74, 0x50, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x79, 0x28, 0x6e, 0x2c, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x65, 0x3f, 0x22, 0x22, 0x3a, 0x65, 0x29, 0x3a, 0x74, - 0x5b, 0x6e, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x3f, - 0x22, 0x22, 0x3a, 0x22, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x22, 0x21, - 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x7c, 0x7c, 0x41, - 0x2e, 0x74, 0x65, 0x73, 0x74, 0x28, 0x6e, 0x29, 0x3f, 0x65, 0x3a, 0x65, - 0x2b, 0x22, 0x70, 0x78, 0x22, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x6e, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, - 0x5f, 0x2c, 0x69, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3b, 0x74, - 0x3a, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x22, 0x3d, - 0x3d, 0x3d, 0x6e, 0x29, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, - 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, - 0x65, 0x29, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2e, 0x63, 0x73, - 0x73, 0x54, 0x65, 0x78, 0x74, 0x3d, 0x65, 0x3b, 0x65, 0x6c, 0x73, 0x65, - 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x5f, 0x26, 0x26, - 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2e, 0x63, 0x73, 0x73, - 0x54, 0x65, 0x78, 0x74, 0x3d, 0x5f, 0x3d, 0x22, 0x22, 0x29, 0x2c, 0x5f, - 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x5f, 0x29, - 0x65, 0x26, 0x26, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x7c, 0x7c, 0x74, - 0x74, 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2c, 0x6e, 0x2c, - 0x22, 0x22, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x65, 0x29, 0x66, 0x6f, 0x72, - 0x28, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x29, 0x5f, 0x26, 0x26, 0x65, - 0x5b, 0x6e, 0x5d, 0x3d, 0x3d, 0x3d, 0x5f, 0x5b, 0x6e, 0x5d, 0x7c, 0x7c, - 0x74, 0x74, 0x28, 0x74, 0x2e, 0x73, 0x74, 0x79, 0x6c, 0x65, 0x2c, 0x6e, - 0x2c, 0x65, 0x5b, 0x6e, 0x5d, 0x29, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x69, 0x66, 0x28, 0x22, 0x6f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, 0x30, - 0x5d, 0x26, 0x26, 0x22, 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, 0x31, - 0x5d, 0x29, 0x6f, 0x3d, 0x6e, 0x21, 0x3d, 0x3d, 0x28, 0x6e, 0x3d, 0x6e, - 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x28, 0x50, - 0x6f, 0x69, 0x6e, 0x74, 0x65, 0x72, 0x43, 0x61, 0x70, 0x74, 0x75, 0x72, - 0x65, 0x29, 0x24, 0x7c, 0x43, 0x61, 0x70, 0x74, 0x75, 0x72, 0x65, 0x24, - 0x2f, 0x69, 0x2c, 0x22, 0x24, 0x31, 0x22, 0x29, 0x29, 0x2c, 0x6e, 0x3d, - 0x6e, 0x2e, 0x74, 0x6f, 0x4c, 0x6f, 0x77, 0x65, 0x72, 0x43, 0x61, 0x73, - 0x65, 0x28, 0x29, 0x69, 0x6e, 0x20, 0x74, 0x3f, 0x6e, 0x2e, 0x74, 0x6f, - 0x4c, 0x6f, 0x77, 0x65, 0x72, 0x43, 0x61, 0x73, 0x65, 0x28, 0x29, 0x2e, - 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x32, 0x29, 0x3a, 0x6e, 0x2e, 0x73, - 0x6c, 0x69, 0x63, 0x65, 0x28, 0x32, 0x29, 0x2c, 0x74, 0x2e, 0x6c, 0x7c, - 0x7c, 0x28, 0x74, 0x2e, 0x6c, 0x3d, 0x7b, 0x7d, 0x29, 0x2c, 0x74, 0x2e, - 0x6c, 0x5b, 0x6e, 0x2b, 0x6f, 0x5d, 0x3d, 0x65, 0x2c, 0x65, 0x3f, 0x5f, - 0x3f, 0x65, 0x2e, 0x75, 0x3d, 0x5f, 0x2e, 0x75, 0x3a, 0x28, 0x65, 0x2e, - 0x75, 0x3d, 0x44, 0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, - 0x2c, 0x74, 0x2e, 0x61, 0x64, 0x64, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, - 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x6e, 0x2c, 0x6f, 0x3f, - 0x5f, 0x74, 0x3a, 0x65, 0x74, 0x2c, 0x6f, 0x29, 0x29, 0x3a, 0x74, 0x2e, - 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x45, 0x76, 0x65, 0x6e, 0x74, 0x4c, - 0x69, 0x73, 0x74, 0x65, 0x6e, 0x65, 0x72, 0x28, 0x6e, 0x2c, 0x6f, 0x3f, - 0x5f, 0x74, 0x3a, 0x65, 0x74, 0x2c, 0x6f, 0x29, 0x3b, 0x65, 0x6c, 0x73, - 0x65, 0x7b, 0x69, 0x66, 0x28, 0x69, 0x29, 0x6e, 0x3d, 0x6e, 0x2e, 0x72, - 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x78, 0x6c, 0x69, 0x6e, - 0x6b, 0x28, 0x48, 0x7c, 0x3a, 0x68, 0x29, 0x2f, 0x2c, 0x22, 0x68, 0x22, - 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x73, - 0x4e, 0x61, 0x6d, 0x65, 0x24, 0x2f, 0x2c, 0x22, 0x73, 0x22, 0x29, 0x3b, - 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x22, 0x77, 0x69, 0x64, - 0x74, 0x68, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x68, 0x65, - 0x69, 0x67, 0x68, 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, - 0x68, 0x72, 0x65, 0x66, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, - 0x6c, 0x69, 0x73, 0x74, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, - 0x66, 0x6f, 0x72, 0x6d, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, - 0x74, 0x61, 0x62, 0x49, 0x6e, 0x64, 0x65, 0x78, 0x22, 0x21, 0x3d, 0x3d, - 0x6e, 0x26, 0x26, 0x22, 0x64, 0x6f, 0x77, 0x6e, 0x6c, 0x6f, 0x61, 0x64, - 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x72, 0x6f, 0x77, 0x53, - 0x70, 0x61, 0x6e, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x22, 0x63, - 0x6f, 0x6c, 0x53, 0x70, 0x61, 0x6e, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, - 0x26, 0x22, 0x72, 0x6f, 0x6c, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x26, - 0x26, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, 0x74, 0x72, 0x79, 0x7b, - 0x74, 0x5b, 0x6e, 0x5d, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, - 0x3f, 0x22, 0x22, 0x3a, 0x65, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x20, - 0x74, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x7d, - 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, - 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x7c, 0x7c, 0x28, 0x6e, - 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, 0x7c, 0x7c, 0x21, 0x31, 0x3d, 0x3d, - 0x3d, 0x65, 0x26, 0x26, 0x22, 0x2d, 0x22, 0x21, 0x3d, 0x3d, 0x6e, 0x5b, - 0x34, 0x5d, 0x3f, 0x74, 0x2e, 0x72, 0x65, 0x6d, 0x6f, 0x76, 0x65, 0x41, - 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x29, 0x3a, - 0x74, 0x2e, 0x73, 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, - 0x74, 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x29, 0x29, 0x7d, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x74, 0x28, 0x74, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, - 0x5b, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2b, 0x21, 0x31, 0x5d, 0x3b, - 0x69, 0x66, 0x28, 0x74, 0x2e, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, - 0x2e, 0x74, 0x3c, 0x3d, 0x6e, 0x2e, 0x75, 0x29, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x74, 0x2e, 0x74, 0x3d, - 0x44, 0x61, 0x74, 0x65, 0x2e, 0x6e, 0x6f, 0x77, 0x28, 0x29, 0x3b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x28, 0x43, 0x2e, 0x65, 0x76, - 0x65, 0x6e, 0x74, 0x3f, 0x43, 0x2e, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x28, - 0x74, 0x29, 0x3a, 0x74, 0x29, 0x7d, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x5f, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, - 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, 0x29, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x6c, 0x5b, 0x74, 0x2e, - 0x74, 0x79, 0x70, 0x65, 0x2b, 0x21, 0x30, 0x5d, 0x28, 0x43, 0x2e, 0x65, - 0x76, 0x65, 0x6e, 0x74, 0x3f, 0x43, 0x2e, 0x65, 0x76, 0x65, 0x6e, 0x74, - 0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x69, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, - 0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, - 0x2c, 0x6c, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x73, 0x2c, 0x63, 0x2c, - 0x68, 0x2c, 0x61, 0x2c, 0x70, 0x2c, 0x64, 0x2c, 0x76, 0x2c, 0x79, 0x2c, - 0x6d, 0x2c, 0x67, 0x2c, 0x62, 0x2c, 0x6b, 0x2c, 0x53, 0x2c, 0x77, 0x2c, - 0x78, 0x2c, 0x45, 0x3d, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3b, 0x69, - 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x6e, - 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, - 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x75, 0x6c, 0x6c, - 0x3b, 0x31, 0x32, 0x38, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x26, - 0x28, 0x66, 0x3d, 0x21, 0x21, 0x28, 0x33, 0x32, 0x26, 0x65, 0x2e, 0x5f, - 0x5f, 0x75, 0x29, 0x2c, 0x6f, 0x3d, 0x5b, 0x75, 0x3d, 0x6e, 0x2e, 0x5f, - 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x5d, 0x29, 0x2c, 0x28, - 0x73, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x62, 0x29, 0x26, 0x26, 0x73, 0x28, - 0x6e, 0x29, 0x3b, 0x74, 0x3a, 0x69, 0x66, 0x28, 0x22, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x45, 0x29, 0x74, 0x72, 0x79, 0x7b, 0x69, 0x66, 0x28, - 0x79, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x6d, 0x3d, - 0x28, 0x73, 0x3d, 0x45, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, - 0x54, 0x79, 0x70, 0x65, 0x29, 0x26, 0x26, 0x5f, 0x5b, 0x73, 0x2e, 0x5f, - 0x5f, 0x63, 0x5d, 0x2c, 0x67, 0x3d, 0x73, 0x3f, 0x6d, 0x3f, 0x6d, 0x2e, - 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, - 0x73, 0x2e, 0x5f, 0x5f, 0x3a, 0x5f, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x63, - 0x3f, 0x76, 0x3d, 0x28, 0x63, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, - 0x65, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x3d, 0x63, 0x2e, - 0x5f, 0x5f, 0x45, 0x3a, 0x28, 0x22, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, - 0x79, 0x70, 0x65, 0x22, 0x69, 0x6e, 0x20, 0x45, 0x26, 0x26, 0x45, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65, - 0x6e, 0x64, 0x65, 0x72, 0x3f, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x63, - 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x28, 0x79, 0x2c, 0x67, 0x29, 0x3a, - 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x63, 0x3d, 0x6e, 0x65, 0x77, - 0x20, 0x49, 0x28, 0x79, 0x2c, 0x67, 0x29, 0x2c, 0x63, 0x2e, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x3d, 0x45, 0x2c, - 0x63, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x3d, 0x6c, 0x74, 0x29, - 0x2c, 0x6d, 0x26, 0x26, 0x6d, 0x2e, 0x73, 0x75, 0x62, 0x28, 0x63, 0x29, - 0x2c, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63, - 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x7c, 0x7c, 0x28, 0x63, 0x2e, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x3d, 0x7b, 0x7d, 0x29, 0x2c, 0x63, 0x2e, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x67, 0x2c, 0x63, 0x2e, 0x5f, - 0x5f, 0x6e, 0x3d, 0x5f, 0x2c, 0x68, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x64, - 0x3d, 0x21, 0x30, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, - 0x2c, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x29, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, - 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x63, 0x2e, 0x73, 0x74, 0x61, - 0x74, 0x65, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x45, 0x2e, - 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, - 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70, 0x73, - 0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x3d, 0x63, 0x2e, - 0x73, 0x74, 0x61, 0x74, 0x65, 0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, - 0x73, 0x3d, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73, - 0x29, 0x29, 0x2c, 0x4d, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x45, - 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, - 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70, - 0x73, 0x28, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x29, 0x29, 0x29, - 0x2c, 0x61, 0x3d, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x70, - 0x3d, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x2c, 0x63, 0x2e, 0x5f, - 0x5f, 0x76, 0x3d, 0x6e, 0x2c, 0x68, 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, - 0x3d, 0x45, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, - 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63, - 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, - 0x6c, 0x6c, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x63, 0x2e, 0x63, - 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, - 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x28, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, - 0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x44, 0x69, 0x64, 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x63, - 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x2e, - 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, - 0x4d, 0x6f, 0x75, 0x6e, 0x74, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, - 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x45, 0x2e, 0x67, - 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, - 0x26, 0x79, 0x21, 0x3d, 0x3d, 0x61, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, - 0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x52, 0x65, 0x63, 0x65, 0x69, 0x76, 0x65, - 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, 0x26, 0x63, 0x2e, 0x63, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x52, 0x65, - 0x63, 0x65, 0x69, 0x76, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x28, 0x79, - 0x2c, 0x67, 0x29, 0x2c, 0x21, 0x63, 0x2e, 0x5f, 0x5f, 0x65, 0x26, 0x26, - 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63, 0x2e, 0x73, 0x68, 0x6f, - 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, - 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26, 0x21, 0x31, 0x3d, 0x3d, - 0x3d, 0x63, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x28, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x67, 0x29, 0x7c, - 0x7c, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, - 0x5f, 0x76, 0x29, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x6e, 0x2e, 0x5f, - 0x5f, 0x76, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, - 0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63, - 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, - 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x21, 0x31, 0x29, 0x2c, 0x6e, - 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x6e, - 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, 0x6e, - 0x2e, 0x5f, 0x5f, 0x6b, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, - 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x29, 0x7b, 0x74, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x6e, - 0x29, 0x7d, 0x29, 0x29, 0x2c, 0x62, 0x3d, 0x30, 0x3b, 0x62, 0x3c, 0x63, - 0x2e, 0x5f, 0x73, 0x62, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, - 0x62, 0x2b, 0x2b, 0x29, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, - 0x73, 0x68, 0x28, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x5b, 0x62, 0x5d, 0x29, - 0x3b, 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x2c, 0x63, 0x2e, - 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, - 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x29, 0x3b, 0x62, 0x72, - 0x65, 0x61, 0x6b, 0x20, 0x74, 0x7d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, - 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, - 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26, 0x63, - 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, - 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x79, 0x2c, 0x63, - 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x67, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, - 0x21, 0x3d, 0x63, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x44, 0x69, 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x26, 0x26, - 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x28, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, - 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, - 0x64, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x28, 0x61, 0x2c, 0x70, 0x2c, - 0x64, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x69, 0x66, 0x28, 0x63, 0x2e, 0x63, - 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x67, 0x2c, 0x63, 0x2e, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x3d, 0x79, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x50, - 0x3d, 0x74, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x31, 0x2c, - 0x6b, 0x3d, 0x43, 0x2e, 0x5f, 0x5f, 0x72, 0x2c, 0x53, 0x3d, 0x30, 0x2c, - 0x22, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x22, 0x69, - 0x6e, 0x20, 0x45, 0x26, 0x26, 0x45, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x29, - 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, - 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x64, - 0x3d, 0x21, 0x31, 0x2c, 0x6b, 0x26, 0x26, 0x6b, 0x28, 0x6e, 0x29, 0x2c, - 0x73, 0x3d, 0x63, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x63, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x63, 0x2e, 0x73, 0x74, 0x61, - 0x74, 0x65, 0x2c, 0x63, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, - 0x29, 0x2c, 0x77, 0x3d, 0x30, 0x3b, 0x77, 0x3c, 0x63, 0x2e, 0x5f, 0x73, - 0x62, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x77, 0x2b, 0x2b, - 0x29, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, - 0x63, 0x2e, 0x5f, 0x73, 0x62, 0x5b, 0x77, 0x5d, 0x29, 0x3b, 0x63, 0x2e, - 0x5f, 0x73, 0x62, 0x3d, 0x5b, 0x5d, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x64, 0x6f, 0x7b, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x21, 0x31, 0x2c, - 0x6b, 0x26, 0x26, 0x6b, 0x28, 0x6e, 0x29, 0x2c, 0x73, 0x3d, 0x63, 0x2e, - 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x28, 0x63, 0x2e, 0x70, 0x72, 0x6f, - 0x70, 0x73, 0x2c, 0x63, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, 0x2c, 0x63, - 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x29, 0x2c, 0x63, 0x2e, - 0x73, 0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x7d, - 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x64, 0x26, - 0x26, 0x2b, 0x2b, 0x53, 0x3c, 0x32, 0x35, 0x29, 0x3b, 0x63, 0x2e, 0x73, - 0x74, 0x61, 0x74, 0x65, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, 0x73, 0x2c, 0x6e, - 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x26, 0x26, - 0x28, 0x5f, 0x3d, 0x4d, 0x28, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x5f, 0x29, - 0x2c, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, - 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x28, 0x29, 0x29, 0x29, 0x2c, 0x68, - 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x63, 0x2e, 0x67, 0x65, - 0x74, 0x53, 0x6e, 0x61, 0x70, 0x73, 0x68, 0x6f, 0x74, 0x42, 0x65, 0x66, - 0x6f, 0x72, 0x65, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x7c, 0x7c, 0x28, - 0x64, 0x3d, 0x63, 0x2e, 0x67, 0x65, 0x74, 0x53, 0x6e, 0x61, 0x70, 0x73, - 0x68, 0x6f, 0x74, 0x42, 0x65, 0x66, 0x6f, 0x72, 0x65, 0x55, 0x70, 0x64, - 0x61, 0x74, 0x65, 0x28, 0x61, 0x2c, 0x70, 0x29, 0x29, 0x2c, 0x4b, 0x28, - 0x74, 0x2c, 0x46, 0x28, 0x78, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, - 0x73, 0x26, 0x26, 0x73, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3d, 0x3d, 0x3d, - 0x6a, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x73, 0x2e, 0x6b, - 0x65, 0x79, 0x3f, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x63, - 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x3a, 0x73, 0x29, 0x3f, 0x78, - 0x3a, 0x5b, 0x78, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, - 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x2c, 0x6c, 0x29, 0x2c, - 0x63, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, - 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x75, 0x26, 0x3d, 0x2d, 0x31, 0x36, 0x31, - 0x2c, 0x63, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x26, 0x26, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x63, 0x29, - 0x2c, 0x76, 0x26, 0x26, 0x28, 0x63, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x63, - 0x2e, 0x5f, 0x5f, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x63, 0x61, - 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, - 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x66, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, - 0x6c, 0x21, 0x3d, 0x6f, 0x3f, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, - 0x75, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x75, 0x7c, 0x3d, 0x66, 0x3f, 0x31, - 0x36, 0x30, 0x3a, 0x33, 0x32, 0x2c, 0x6f, 0x5b, 0x6f, 0x2e, 0x69, 0x6e, - 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x75, 0x29, 0x5d, 0x3d, 0x6e, 0x75, - 0x6c, 0x6c, 0x29, 0x3a, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, - 0x2e, 0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, - 0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x2c, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, - 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x6f, 0x26, 0x26, 0x6e, 0x2e, 0x5f, - 0x5f, 0x76, 0x3d, 0x3d, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x3f, 0x28, - 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x2c, - 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x65, 0x29, - 0x3a, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x72, 0x74, 0x28, 0x65, 0x2e, - 0x5f, 0x5f, 0x65, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x2c, - 0x6f, 0x2c, 0x72, 0x2c, 0x66, 0x2c, 0x6c, 0x29, 0x3b, 0x28, 0x73, 0x3d, - 0x43, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x29, 0x26, 0x26, 0x73, - 0x28, 0x6e, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x6f, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x66, - 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3d, 0x30, 0x3b, 0x5f, - 0x3c, 0x65, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x5f, 0x2b, - 0x2b, 0x29, 0x75, 0x74, 0x28, 0x65, 0x5b, 0x5f, 0x5d, 0x2c, 0x65, 0x5b, - 0x2b, 0x2b, 0x5f, 0x5d, 0x2c, 0x65, 0x5b, 0x2b, 0x2b, 0x5f, 0x5d, 0x29, - 0x3b, 0x43, 0x2e, 0x5f, 0x5f, 0x63, 0x26, 0x26, 0x43, 0x2e, 0x5f, 0x5f, - 0x63, 0x28, 0x6e, 0x2c, 0x74, 0x29, 0x2c, 0x74, 0x2e, 0x73, 0x6f, 0x6d, - 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x6e, 0x29, 0x7b, 0x74, 0x72, 0x79, 0x7b, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, - 0x5f, 0x68, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, - 0x74, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x63, 0x61, - 0x6c, 0x6c, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x29, 0x7d, 0x63, 0x61, 0x74, - 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, - 0x74, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x7d, 0x7d, 0x29, 0x29, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x72, 0x74, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, - 0x2c, 0x72, 0x2c, 0x75, 0x2c, 0x66, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x6c, 0x2c, 0x73, 0x2c, 0x63, 0x2c, 0x68, 0x2c, 0x61, 0x2c, 0x70, 0x2c, - 0x64, 0x2c, 0x76, 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, - 0x79, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2c, 0x6d, 0x3d, - 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x22, 0x73, - 0x76, 0x67, 0x22, 0x3d, 0x3d, 0x3d, 0x6d, 0x26, 0x26, 0x28, 0x69, 0x3d, - 0x21, 0x30, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x29, - 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x3d, 0x30, 0x3b, 0x6c, 0x3c, 0x6f, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x69, - 0x66, 0x28, 0x28, 0x61, 0x3d, 0x6f, 0x5b, 0x6c, 0x5d, 0x29, 0x26, 0x26, - 0x22, 0x73, 0x65, 0x74, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, - 0x65, 0x22, 0x69, 0x6e, 0x20, 0x61, 0x3d, 0x3d, 0x21, 0x21, 0x6d, 0x26, - 0x26, 0x28, 0x6d, 0x3f, 0x61, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, 0x4e, - 0x61, 0x6d, 0x65, 0x3d, 0x3d, 0x3d, 0x6d, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, - 0x61, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29, 0x29, - 0x7b, 0x74, 0x3d, 0x61, 0x2c, 0x6f, 0x5b, 0x6c, 0x5d, 0x3d, 0x6e, 0x75, - 0x6c, 0x6c, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, 0x7d, 0x69, 0x66, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x3d, 0x6d, 0x29, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x64, 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, - 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x54, 0x65, 0x78, 0x74, 0x4e, - 0x6f, 0x64, 0x65, 0x28, 0x79, 0x29, 0x3b, 0x74, 0x3d, 0x69, 0x3f, 0x64, - 0x6f, 0x63, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, - 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x4e, 0x53, 0x28, - 0x22, 0x68, 0x74, 0x74, 0x70, 0x3a, 0x2f, 0x2f, 0x77, 0x77, 0x77, 0x2e, - 0x77, 0x33, 0x2e, 0x6f, 0x72, 0x67, 0x2f, 0x32, 0x30, 0x30, 0x30, 0x2f, - 0x73, 0x76, 0x67, 0x22, 0x2c, 0x6d, 0x29, 0x3a, 0x64, 0x6f, 0x63, 0x75, - 0x6d, 0x65, 0x6e, 0x74, 0x2e, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x45, - 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x28, 0x6d, 0x2c, 0x79, 0x2e, 0x69, - 0x73, 0x26, 0x26, 0x79, 0x29, 0x2c, 0x6f, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, - 0x2c, 0x75, 0x3d, 0x21, 0x31, 0x7d, 0x69, 0x66, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x3d, 0x6d, 0x29, 0x76, 0x3d, 0x3d, 0x3d, 0x79, 0x7c, - 0x7c, 0x75, 0x26, 0x26, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d, 0x3d, - 0x3d, 0x79, 0x7c, 0x7c, 0x28, 0x74, 0x2e, 0x64, 0x61, 0x74, 0x61, 0x3d, - 0x79, 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x69, 0x66, 0x28, 0x6f, - 0x3d, 0x6f, 0x26, 0x26, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, - 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x73, 0x29, - 0x2c, 0x76, 0x3d, 0x65, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x7c, 0x7c, - 0x54, 0x2c, 0x21, 0x75, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, - 0x6f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x3d, 0x7b, 0x7d, 0x2c, 0x6c, - 0x3d, 0x30, 0x3b, 0x6c, 0x3c, 0x74, 0x2e, 0x61, 0x74, 0x74, 0x72, 0x69, - 0x62, 0x75, 0x74, 0x65, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x76, 0x5b, 0x28, 0x61, 0x3d, 0x74, 0x2e, - 0x61, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x73, 0x5b, 0x6c, - 0x5d, 0x29, 0x2e, 0x6e, 0x61, 0x6d, 0x65, 0x5d, 0x3d, 0x61, 0x2e, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x20, 0x69, - 0x6e, 0x20, 0x76, 0x29, 0x61, 0x3d, 0x76, 0x5b, 0x6c, 0x5d, 0x2c, 0x22, - 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, 0x6c, - 0x7c, 0x7c, 0x28, 0x22, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, - 0x73, 0x6c, 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, - 0x54, 0x4d, 0x4c, 0x22, 0x3d, 0x3d, 0x6c, 0x3f, 0x63, 0x3d, 0x61, 0x3a, - 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x3d, 0x6c, 0x7c, 0x7c, 0x6c, - 0x20, 0x69, 0x6e, 0x20, 0x79, 0x7c, 0x7c, 0x6e, 0x74, 0x28, 0x74, 0x2c, - 0x6c, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x61, 0x2c, 0x69, 0x29, 0x29, - 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x20, 0x69, 0x6e, 0x20, 0x79, 0x29, - 0x61, 0x3d, 0x79, 0x5b, 0x6c, 0x5d, 0x2c, 0x22, 0x63, 0x68, 0x69, 0x6c, - 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, 0x6c, 0x3f, 0x68, 0x3d, 0x61, - 0x3a, 0x22, 0x64, 0x61, 0x6e, 0x67, 0x65, 0x72, 0x6f, 0x75, 0x73, 0x6c, - 0x79, 0x53, 0x65, 0x74, 0x49, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, - 0x4c, 0x22, 0x3d, 0x3d, 0x6c, 0x3f, 0x73, 0x3d, 0x61, 0x3a, 0x22, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x22, 0x3d, 0x3d, 0x6c, 0x3f, 0x70, 0x3d, 0x61, - 0x3a, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x22, 0x3d, 0x3d, - 0x6c, 0x3f, 0x64, 0x3d, 0x61, 0x3a, 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, - 0x3d, 0x3d, 0x6c, 0x7c, 0x7c, 0x75, 0x26, 0x26, 0x22, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x61, 0x7c, 0x7c, 0x76, 0x5b, 0x6c, 0x5d, 0x3d, 0x3d, - 0x3d, 0x61, 0x7c, 0x7c, 0x6e, 0x74, 0x28, 0x74, 0x2c, 0x6c, 0x2c, 0x61, - 0x2c, 0x76, 0x5b, 0x6c, 0x5d, 0x2c, 0x69, 0x29, 0x3b, 0x69, 0x66, 0x28, - 0x73, 0x29, 0x75, 0x7c, 0x7c, 0x63, 0x26, 0x26, 0x28, 0x73, 0x2e, 0x5f, - 0x5f, 0x68, 0x74, 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x63, 0x2e, 0x5f, 0x5f, - 0x68, 0x74, 0x6d, 0x6c, 0x7c, 0x7c, 0x73, 0x2e, 0x5f, 0x5f, 0x68, 0x74, - 0x6d, 0x6c, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, - 0x48, 0x54, 0x4d, 0x4c, 0x29, 0x7c, 0x7c, 0x28, 0x74, 0x2e, 0x69, 0x6e, - 0x6e, 0x65, 0x72, 0x48, 0x54, 0x4d, 0x4c, 0x3d, 0x73, 0x2e, 0x5f, 0x5f, - 0x68, 0x74, 0x6d, 0x6c, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, - 0x5b, 0x5d, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x28, 0x63, - 0x26, 0x26, 0x28, 0x74, 0x2e, 0x69, 0x6e, 0x6e, 0x65, 0x72, 0x48, 0x54, - 0x4d, 0x4c, 0x3d, 0x22, 0x22, 0x29, 0x2c, 0x4b, 0x28, 0x74, 0x2c, 0x46, - 0x28, 0x68, 0x29, 0x3f, 0x68, 0x3a, 0x5b, 0x68, 0x5d, 0x2c, 0x6e, 0x2c, - 0x65, 0x2c, 0x5f, 0x2c, 0x69, 0x26, 0x26, 0x22, 0x66, 0x6f, 0x72, 0x65, - 0x69, 0x67, 0x6e, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x22, 0x21, 0x3d, - 0x3d, 0x6d, 0x2c, 0x6f, 0x2c, 0x72, 0x2c, 0x6f, 0x3f, 0x6f, 0x5b, 0x30, - 0x5d, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x26, 0x26, 0x71, 0x28, 0x65, - 0x2c, 0x30, 0x29, 0x2c, 0x75, 0x2c, 0x66, 0x29, 0x2c, 0x6e, 0x75, 0x6c, - 0x6c, 0x21, 0x3d, 0x6f, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x3d, 0x6f, - 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6c, 0x2d, 0x2d, 0x3b, - 0x29, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x5b, 0x6c, 0x5d, 0x26, - 0x26, 0x57, 0x28, 0x6f, 0x5b, 0x6c, 0x5d, 0x29, 0x3b, 0x75, 0x7c, 0x7c, - 0x28, 0x6c, 0x3d, 0x22, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x22, 0x2c, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x70, 0x26, 0x26, 0x28, - 0x70, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x6c, 0x5d, 0x7c, 0x7c, 0x22, 0x70, - 0x72, 0x6f, 0x67, 0x72, 0x65, 0x73, 0x73, 0x22, 0x3d, 0x3d, 0x3d, 0x6d, - 0x26, 0x26, 0x21, 0x70, 0x7c, 0x7c, 0x22, 0x6f, 0x70, 0x74, 0x69, 0x6f, - 0x6e, 0x22, 0x3d, 0x3d, 0x3d, 0x6d, 0x26, 0x26, 0x70, 0x21, 0x3d, 0x3d, - 0x76, 0x5b, 0x6c, 0x5d, 0x29, 0x26, 0x26, 0x6e, 0x74, 0x28, 0x74, 0x2c, - 0x6c, 0x2c, 0x70, 0x2c, 0x76, 0x5b, 0x6c, 0x5d, 0x2c, 0x21, 0x31, 0x29, - 0x2c, 0x6c, 0x3d, 0x22, 0x63, 0x68, 0x65, 0x63, 0x6b, 0x65, 0x64, 0x22, - 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x64, 0x26, - 0x26, 0x64, 0x21, 0x3d, 0x3d, 0x74, 0x5b, 0x6c, 0x5d, 0x26, 0x26, 0x6e, - 0x74, 0x28, 0x74, 0x2c, 0x6c, 0x2c, 0x64, 0x2c, 0x76, 0x5b, 0x6c, 0x5d, - 0x2c, 0x21, 0x31, 0x29, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x75, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x74, 0x72, - 0x79, 0x7b, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x3f, 0x74, - 0x28, 0x6e, 0x29, 0x3a, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x3d, 0x6e, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, - 0x7b, 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x65, 0x29, 0x7d, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x66, 0x74, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x5f, 0x2c, 0x69, 0x3b, 0x69, 0x66, 0x28, 0x43, 0x2e, 0x75, 0x6e, 0x6d, - 0x6f, 0x75, 0x6e, 0x74, 0x26, 0x26, 0x43, 0x2e, 0x75, 0x6e, 0x6d, 0x6f, - 0x75, 0x6e, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x28, 0x5f, 0x3d, 0x74, 0x2e, - 0x72, 0x65, 0x66, 0x29, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x63, 0x75, 0x72, - 0x72, 0x65, 0x6e, 0x74, 0x26, 0x26, 0x5f, 0x2e, 0x63, 0x75, 0x72, 0x72, - 0x65, 0x6e, 0x74, 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x7c, - 0x7c, 0x75, 0x74, 0x28, 0x5f, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6e, - 0x29, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x28, 0x5f, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x5f, - 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x57, 0x69, - 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x29, 0x74, 0x72, - 0x79, 0x7b, 0x5f, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, - 0x28, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, - 0x43, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x5f, - 0x2e, 0x62, 0x61, 0x73, 0x65, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x50, 0x3d, - 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x69, 0x66, 0x28, 0x5f, 0x3d, 0x74, - 0x2e, 0x5f, 0x5f, 0x6b, 0x29, 0x66, 0x6f, 0x72, 0x28, 0x69, 0x3d, 0x30, - 0x3b, 0x69, 0x3c, 0x5f, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, - 0x69, 0x2b, 0x2b, 0x29, 0x5f, 0x5b, 0x69, 0x5d, 0x26, 0x26, 0x66, 0x74, - 0x28, 0x5f, 0x5b, 0x69, 0x5d, 0x2c, 0x6e, 0x2c, 0x65, 0x7c, 0x7c, 0x22, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x21, 0x3d, 0x74, - 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, - 0x29, 0x3b, 0x65, 0x7c, 0x7c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x74, - 0x2e, 0x5f, 0x5f, 0x65, 0x7c, 0x7c, 0x57, 0x28, 0x74, 0x2e, 0x5f, 0x5f, - 0x65, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, - 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x6c, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x74, 0x2c, - 0x65, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x73, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, - 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x43, 0x2e, - 0x5f, 0x5f, 0x26, 0x26, 0x43, 0x2e, 0x5f, 0x5f, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x2c, 0x69, 0x3d, 0x28, 0x5f, 0x3d, 0x22, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, - 0x66, 0x20, 0x65, 0x29, 0x3f, 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x65, 0x26, - 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x6b, 0x7c, 0x7c, 0x6e, 0x2e, 0x5f, 0x5f, - 0x6b, 0x2c, 0x6f, 0x3d, 0x5b, 0x5d, 0x2c, 0x72, 0x3d, 0x5b, 0x5d, 0x2c, - 0x69, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x3d, 0x28, 0x21, 0x5f, 0x26, 0x26, - 0x65, 0x7c, 0x7c, 0x6e, 0x29, 0x2e, 0x5f, 0x5f, 0x6b, 0x3d, 0x4c, 0x28, - 0x6a, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x5b, 0x74, 0x5d, 0x29, 0x2c, - 0x69, 0x7c, 0x7c, 0x54, 0x2c, 0x54, 0x2c, 0x76, 0x6f, 0x69, 0x64, 0x20, - 0x30, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6f, 0x77, 0x6e, 0x65, 0x72, 0x53, - 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x21, 0x5f, - 0x26, 0x26, 0x65, 0x3f, 0x5b, 0x65, 0x5d, 0x3a, 0x69, 0x3f, 0x6e, 0x75, - 0x6c, 0x6c, 0x3a, 0x6e, 0x2e, 0x66, 0x69, 0x72, 0x73, 0x74, 0x43, 0x68, - 0x69, 0x6c, 0x64, 0x3f, 0x78, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x6e, - 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x4e, 0x6f, 0x64, 0x65, 0x73, 0x29, - 0x3a, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x6f, 0x2c, 0x21, 0x5f, 0x26, 0x26, - 0x65, 0x3f, 0x65, 0x3a, 0x69, 0x3f, 0x69, 0x2e, 0x5f, 0x5f, 0x65, 0x3a, - 0x6e, 0x2e, 0x66, 0x69, 0x72, 0x73, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, - 0x2c, 0x5f, 0x2c, 0x72, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x64, 0x3d, - 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6f, 0x74, 0x28, 0x6f, 0x2c, - 0x74, 0x2c, 0x72, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x63, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x73, 0x74, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x63, 0x74, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x68, 0x74, 0x28, 0x74, 0x2c, 0x6e, - 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x2c, 0x69, 0x2c, - 0x6f, 0x2c, 0x72, 0x2c, 0x75, 0x3d, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x74, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, - 0x6f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x26, - 0x26, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x26, 0x26, 0x28, 0x72, - 0x3d, 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x64, 0x65, 0x66, 0x61, - 0x75, 0x6c, 0x74, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x29, 0x2c, 0x6e, 0x29, - 0x22, 0x6b, 0x65, 0x79, 0x22, 0x3d, 0x3d, 0x6f, 0x3f, 0x5f, 0x3d, 0x6e, - 0x5b, 0x6f, 0x5d, 0x3a, 0x22, 0x72, 0x65, 0x66, 0x22, 0x3d, 0x3d, 0x6f, - 0x3f, 0x69, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3a, 0x75, 0x5b, 0x6f, 0x5d, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6e, 0x5b, - 0x6f, 0x5d, 0x26, 0x26, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, - 0x3d, 0x72, 0x3f, 0x72, 0x5b, 0x6f, 0x5d, 0x3a, 0x6e, 0x5b, 0x6f, 0x5d, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x72, 0x67, 0x75, - 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x3e, 0x32, 0x26, 0x26, 0x28, 0x75, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, - 0x72, 0x65, 0x6e, 0x3d, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, - 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x33, 0x3f, 0x78, - 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x61, 0x72, 0x67, 0x75, 0x6d, 0x65, - 0x6e, 0x74, 0x73, 0x2c, 0x32, 0x29, 0x3a, 0x65, 0x29, 0x2c, 0x4f, 0x28, - 0x74, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x2c, 0x75, 0x2c, 0x5f, 0x7c, 0x7c, - 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x2c, 0x69, 0x7c, 0x7c, 0x74, 0x2e, 0x72, - 0x65, 0x66, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x61, 0x74, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x7b, 0x5f, 0x5f, 0x63, - 0x3a, 0x6e, 0x3d, 0x22, 0x5f, 0x5f, 0x63, 0x43, 0x22, 0x2b, 0x44, 0x2b, - 0x2b, 0x2c, 0x5f, 0x5f, 0x3a, 0x74, 0x2c, 0x43, 0x6f, 0x6e, 0x73, 0x75, - 0x6d, 0x65, 0x72, 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x28, - 0x6e, 0x29, 0x7d, 0x2c, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72, - 0x3a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, - 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x2c, 0x5f, 0x3b, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, - 0x43, 0x68, 0x69, 0x6c, 0x64, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, - 0x7c, 0x7c, 0x28, 0x65, 0x3d, 0x5b, 0x5d, 0x2c, 0x28, 0x5f, 0x3d, 0x7b, - 0x7d, 0x29, 0x5b, 0x6e, 0x5d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x67, 0x65, 0x74, 0x43, 0x68, 0x69, 0x6c, 0x64, - 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x5f, 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x68, - 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x21, 0x3d, 0x3d, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26, - 0x65, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, - 0x65, 0x3d, 0x21, 0x30, 0x2c, 0x7a, 0x28, 0x74, 0x29, 0x7d, 0x29, 0x29, - 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x75, 0x62, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x65, - 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, - 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, - 0x74, 0x3b, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, - 0x65, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x65, 0x2e, 0x69, - 0x6e, 0x64, 0x65, 0x78, 0x4f, 0x66, 0x28, 0x74, 0x29, 0x2c, 0x31, 0x29, - 0x2c, 0x6e, 0x26, 0x26, 0x6e, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, - 0x29, 0x7d, 0x7d, 0x29, 0x2c, 0x74, 0x2e, 0x63, 0x68, 0x69, 0x6c, 0x64, - 0x72, 0x65, 0x6e, 0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x65, 0x2e, 0x50, 0x72, 0x6f, 0x76, 0x69, 0x64, 0x65, 0x72, 0x2e, - 0x5f, 0x5f, 0x3d, 0x65, 0x2e, 0x43, 0x6f, 0x6e, 0x73, 0x75, 0x6d, 0x65, - 0x72, 0x2e, 0x63, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x54, 0x79, 0x70, - 0x65, 0x3d, 0x65, 0x7d, 0x78, 0x3d, 0x56, 0x2e, 0x73, 0x6c, 0x69, 0x63, - 0x65, 0x2c, 0x43, 0x3d, 0x7b, 0x5f, 0x5f, 0x65, 0x3a, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, - 0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x69, - 0x2c, 0x6f, 0x2c, 0x72, 0x3b, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, - 0x29, 0x69, 0x66, 0x28, 0x28, 0x69, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x63, - 0x29, 0x26, 0x26, 0x21, 0x69, 0x2e, 0x5f, 0x5f, 0x29, 0x74, 0x72, 0x79, - 0x7b, 0x69, 0x66, 0x28, 0x28, 0x6f, 0x3d, 0x69, 0x2e, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x29, 0x26, 0x26, 0x6e, - 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x6f, 0x2e, 0x67, 0x65, 0x74, 0x44, 0x65, - 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, 0x74, 0x65, 0x46, 0x72, - 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x26, 0x26, 0x28, 0x69, 0x2e, - 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x6f, 0x2e, 0x67, - 0x65, 0x74, 0x44, 0x65, 0x72, 0x69, 0x76, 0x65, 0x64, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x46, 0x72, 0x6f, 0x6d, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, - 0x74, 0x29, 0x29, 0x2c, 0x72, 0x3d, 0x69, 0x2e, 0x5f, 0x5f, 0x64, 0x29, - 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x69, 0x2e, 0x63, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, - 0x63, 0x68, 0x26, 0x26, 0x28, 0x69, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, - 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, - 0x28, 0x74, 0x2c, 0x5f, 0x7c, 0x7c, 0x7b, 0x7d, 0x29, 0x2c, 0x72, 0x3d, - 0x69, 0x2e, 0x5f, 0x5f, 0x64, 0x29, 0x2c, 0x72, 0x29, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x5f, 0x5f, 0x45, 0x3d, 0x69, 0x7d, - 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x3d, 0x6e, - 0x7d, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x74, 0x7d, 0x7d, 0x2c, 0x45, - 0x3d, 0x30, 0x2c, 0x55, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x6e, 0x75, 0x6c, - 0x6c, 0x3d, 0x3d, 0x74, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, - 0x63, 0x74, 0x6f, 0x72, 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74, - 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3b, 0x65, - 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, - 0x73, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61, - 0x74, 0x65, 0x3f, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x3a, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x73, 0x3d, 0x4d, 0x28, 0x7b, - 0x7d, 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x73, 0x74, 0x61, 0x74, 0x65, - 0x29, 0x2c, 0x22, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, - 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x74, 0x26, 0x26, - 0x28, 0x74, 0x3d, 0x74, 0x28, 0x4d, 0x28, 0x7b, 0x7d, 0x2c, 0x65, 0x29, - 0x2c, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x29, - 0x29, 0x2c, 0x74, 0x26, 0x26, 0x4d, 0x28, 0x65, 0x2c, 0x74, 0x29, 0x2c, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, 0x28, 0x6e, 0x26, 0x26, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x73, 0x62, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x6e, 0x29, 0x2c, 0x7a, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, 0x29, - 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, - 0x65, 0x2e, 0x66, 0x6f, 0x72, 0x63, 0x65, 0x55, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x29, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x26, 0x26, - 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x65, 0x3d, 0x21, 0x30, - 0x2c, 0x74, 0x26, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x68, - 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x74, 0x29, 0x2c, 0x7a, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x29, 0x29, 0x7d, 0x2c, 0x49, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x72, 0x65, 0x6e, 0x64, 0x65, - 0x72, 0x3d, 0x6a, 0x2c, 0x48, 0x3d, 0x5b, 0x5d, 0x2c, 0x4e, 0x3d, 0x22, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, - 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, - 0x65, 0x3f, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x70, 0x72, - 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x74, 0x68, 0x65, 0x6e, - 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, - 0x65, 0x2e, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x28, 0x29, 0x29, - 0x3a, 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x2c, - 0x24, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x2e, 0x5f, 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x2d, 0x6e, 0x2e, 0x5f, - 0x5f, 0x76, 0x2e, 0x5f, 0x5f, 0x62, 0x7d, 0x2c, 0x4a, 0x2e, 0x5f, 0x5f, - 0x72, 0x3d, 0x30, 0x2c, 0x44, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, - 0x70, 0x74, 0x2c, 0x64, 0x74, 0x2c, 0x76, 0x74, 0x2c, 0x79, 0x74, 0x2c, - 0x6d, 0x74, 0x3d, 0x30, 0x2c, 0x67, 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x62, - 0x74, 0x3d, 0x5b, 0x5d, 0x2c, 0x6b, 0x74, 0x3d, 0x43, 0x2c, 0x53, 0x74, - 0x3d, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x62, 0x2c, 0x77, 0x74, 0x3d, 0x6b, - 0x74, 0x2e, 0x5f, 0x5f, 0x72, 0x2c, 0x78, 0x74, 0x3d, 0x6b, 0x74, 0x2e, - 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x2c, 0x43, 0x74, 0x3d, 0x6b, 0x74, - 0x2e, 0x5f, 0x5f, 0x63, 0x2c, 0x45, 0x74, 0x3d, 0x6b, 0x74, 0x2e, 0x75, - 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x2c, 0x55, 0x74, 0x3d, 0x6b, 0x74, - 0x2e, 0x5f, 0x5f, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x20, 0x48, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x6b, 0x74, 0x2e, - 0x5f, 0x5f, 0x68, 0x26, 0x26, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x28, - 0x64, 0x74, 0x2c, 0x74, 0x2c, 0x6d, 0x74, 0x7c, 0x7c, 0x6e, 0x29, 0x2c, - 0x6d, 0x74, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x64, - 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x7c, 0x7c, 0x28, 0x64, 0x74, 0x2e, 0x5f, - 0x5f, 0x48, 0x3d, 0x7b, 0x5f, 0x5f, 0x3a, 0x5b, 0x5d, 0x2c, 0x5f, 0x5f, - 0x68, 0x3a, 0x5b, 0x5d, 0x7d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x3e, 0x3d, 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x26, 0x26, 0x65, 0x2e, 0x5f, 0x5f, 0x2e, 0x70, - 0x75, 0x73, 0x68, 0x28, 0x7b, 0x5f, 0x5f, 0x56, 0x3a, 0x62, 0x74, 0x7d, - 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x5b, 0x74, 0x5d, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x50, 0x74, 0x28, 0x74, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x74, 0x3d, 0x31, - 0x2c, 0x4e, 0x74, 0x28, 0x7a, 0x74, 0x2c, 0x74, 0x29, 0x7d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4e, 0x74, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3d, 0x48, - 0x74, 0x28, 0x70, 0x74, 0x2b, 0x2b, 0x2c, 0x32, 0x29, 0x3b, 0x69, 0x66, - 0x28, 0x5f, 0x2e, 0x74, 0x3d, 0x74, 0x2c, 0x21, 0x5f, 0x2e, 0x5f, 0x5f, - 0x63, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x5f, 0x5f, 0x3d, 0x5b, 0x65, 0x3f, - 0x65, 0x28, 0x6e, 0x29, 0x3a, 0x7a, 0x74, 0x28, 0x76, 0x6f, 0x69, 0x64, - 0x20, 0x30, 0x2c, 0x6e, 0x29, 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, - 0x5f, 0x2e, 0x5f, 0x5f, 0x4e, 0x3f, 0x5f, 0x2e, 0x5f, 0x5f, 0x4e, 0x5b, - 0x30, 0x5d, 0x3a, 0x5f, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x2c, 0x65, - 0x3d, 0x5f, 0x2e, 0x74, 0x28, 0x6e, 0x2c, 0x74, 0x29, 0x3b, 0x6e, 0x21, - 0x3d, 0x3d, 0x65, 0x26, 0x26, 0x28, 0x5f, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, - 0x5b, 0x65, 0x2c, 0x5f, 0x2e, 0x5f, 0x5f, 0x5b, 0x31, 0x5d, 0x5d, 0x2c, - 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, - 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x29, 0x7d, 0x5d, 0x2c, 0x5f, 0x2e, - 0x5f, 0x5f, 0x63, 0x3d, 0x64, 0x74, 0x2c, 0x21, 0x64, 0x74, 0x2e, 0x75, - 0x29, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, - 0x7b, 0x69, 0x66, 0x28, 0x21, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, - 0x5f, 0x48, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, - 0x76, 0x61, 0x72, 0x20, 0x69, 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, - 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, - 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x21, 0x74, - 0x2e, 0x5f, 0x5f, 0x63, 0x7d, 0x29, 0x29, 0x3b, 0x69, 0x66, 0x28, 0x69, - 0x2e, 0x65, 0x76, 0x65, 0x72, 0x79, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x7d, 0x29, 0x29, 0x29, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x6f, 0x7c, 0x7c, 0x6f, 0x2e, - 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x21, - 0x31, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x69, 0x2e, 0x66, - 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, - 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x21, 0x3d, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x5b, 0x30, 0x5d, 0x26, 0x26, 0x28, 0x72, 0x3d, - 0x21, 0x30, 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x21, 0x28, 0x21, 0x72, - 0x26, 0x26, 0x5f, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x70, 0x72, 0x6f, 0x70, - 0x73, 0x3d, 0x3d, 0x3d, 0x74, 0x29, 0x26, 0x26, 0x28, 0x21, 0x6f, 0x7c, - 0x7c, 0x6f, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x29, 0x7d, 0x3b, 0x64, 0x74, - 0x2e, 0x75, 0x3d, 0x21, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, - 0x64, 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, - 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x2c, 0x72, 0x3d, 0x64, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, - 0x65, 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, - 0x65, 0x3b, 0x64, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, - 0x6e, 0x74, 0x57, 0x69, 0x6c, 0x6c, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x3d, - 0x6f, 0x3b, 0x6f, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x69, - 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x2c, 0x6f, 0x3d, 0x5f, 0x7d, - 0x72, 0x26, 0x26, 0x72, 0x2e, 0x63, 0x61, 0x6c, 0x6c, 0x28, 0x74, 0x68, - 0x69, 0x73, 0x2c, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7d, 0x2c, 0x64, - 0x74, 0x2e, 0x73, 0x68, 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, - 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, - 0x69, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x5f, 0x2e, 0x5f, - 0x5f, 0x4e, 0x7c, 0x7c, 0x5f, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x24, 0x74, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x48, 0x74, 0x28, 0x70, - 0x74, 0x2b, 0x2b, 0x2c, 0x33, 0x29, 0x3b, 0x21, 0x6b, 0x74, 0x2e, 0x5f, - 0x5f, 0x73, 0x26, 0x26, 0x47, 0x74, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, - 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, - 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, 0x2c, 0x64, 0x74, 0x2e, 0x5f, 0x5f, - 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, - 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x44, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, - 0x65, 0x3d, 0x48, 0x74, 0x28, 0x70, 0x74, 0x2b, 0x2b, 0x2c, 0x34, 0x29, - 0x3b, 0x21, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x73, 0x26, 0x26, 0x47, 0x74, - 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x26, 0x26, 0x28, - 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x65, 0x2e, 0x69, 0x3d, 0x6e, - 0x2c, 0x64, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x65, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x54, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x6d, 0x74, 0x3d, 0x35, 0x2c, 0x41, 0x74, 0x28, 0x28, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x3a, 0x74, 0x7d, 0x7d, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x56, 0x74, 0x28, 0x74, - 0x2c, 0x6e, 0x2c, 0x65, 0x29, 0x7b, 0x6d, 0x74, 0x3d, 0x36, 0x2c, 0x44, - 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, - 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x74, 0x3f, 0x28, 0x74, 0x28, 0x6e, 0x28, 0x29, 0x29, - 0x2c, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x28, 0x6e, 0x75, 0x6c, - 0x6c, 0x29, 0x7d, 0x29, 0x3a, 0x74, 0x3f, 0x28, 0x74, 0x2e, 0x63, 0x75, - 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x6e, 0x28, 0x29, 0x2c, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x20, 0x74, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, - 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x29, 0x3a, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x7d, 0x29, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, - 0x65, 0x3f, 0x65, 0x3a, 0x65, 0x2e, 0x63, 0x6f, 0x6e, 0x63, 0x61, 0x74, - 0x28, 0x74, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x41, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x76, 0x61, - 0x72, 0x20, 0x65, 0x3d, 0x48, 0x74, 0x28, 0x70, 0x74, 0x2b, 0x2b, 0x2c, - 0x37, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x47, 0x74, - 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2c, 0x6e, 0x29, 0x3f, 0x28, 0x65, - 0x2e, 0x5f, 0x5f, 0x56, 0x3d, 0x74, 0x28, 0x29, 0x2c, 0x65, 0x2e, 0x69, - 0x3d, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x74, 0x2c, 0x65, - 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x3a, 0x65, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x46, 0x74, 0x28, 0x74, - 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, - 0x74, 0x3d, 0x38, 0x2c, 0x41, 0x74, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x7d, 0x29, 0x2c, 0x6e, 0x29, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4d, 0x74, 0x28, 0x74, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x64, 0x74, 0x2e, 0x63, 0x6f, 0x6e, - 0x74, 0x65, 0x78, 0x74, 0x5b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x5d, 0x2c, - 0x65, 0x3d, 0x48, 0x74, 0x28, 0x70, 0x74, 0x2b, 0x2b, 0x2c, 0x39, 0x29, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, 0x63, 0x3d, - 0x74, 0x2c, 0x6e, 0x3f, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x3d, 0x3d, 0x65, - 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x3d, 0x21, - 0x30, 0x2c, 0x6e, 0x2e, 0x73, 0x75, 0x62, 0x28, 0x64, 0x74, 0x29, 0x29, - 0x2c, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x3a, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x57, 0x74, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7b, 0x6b, 0x74, 0x2e, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, - 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, 0x26, 0x26, 0x6b, 0x74, 0x2e, 0x75, - 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, 0x56, 0x61, 0x6c, 0x75, 0x65, - 0x28, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x74, 0x29, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4c, 0x74, 0x28, 0x74, - 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x48, 0x74, 0x28, 0x70, - 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x30, 0x29, 0x2c, 0x65, 0x3d, 0x50, 0x74, - 0x28, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x2e, - 0x5f, 0x5f, 0x3d, 0x74, 0x2c, 0x64, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, - 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, - 0x68, 0x7c, 0x7c, 0x28, 0x64, 0x74, 0x2e, 0x63, 0x6f, 0x6d, 0x70, 0x6f, - 0x6e, 0x65, 0x6e, 0x74, 0x44, 0x69, 0x64, 0x43, 0x61, 0x74, 0x63, 0x68, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, - 0x5f, 0x29, 0x7b, 0x6e, 0x2e, 0x5f, 0x5f, 0x26, 0x26, 0x6e, 0x2e, 0x5f, - 0x5f, 0x28, 0x74, 0x2c, 0x5f, 0x29, 0x2c, 0x65, 0x5b, 0x31, 0x5d, 0x28, - 0x74, 0x29, 0x7d, 0x29, 0x2c, 0x5b, 0x65, 0x5b, 0x30, 0x5d, 0x2c, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x65, 0x5b, - 0x31, 0x5d, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x29, 0x7d, 0x5d, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4f, 0x74, - 0x28, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3d, 0x48, 0x74, 0x28, - 0x70, 0x74, 0x2b, 0x2b, 0x2c, 0x31, 0x31, 0x29, 0x3b, 0x69, 0x66, 0x28, - 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, 0x76, - 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x64, 0x74, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, - 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x21, 0x6e, - 0x2e, 0x5f, 0x5f, 0x6d, 0x26, 0x26, 0x6e, 0x75, 0x6c, 0x6c, 0x21, 0x3d, - 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x3b, 0x29, 0x6e, 0x3d, 0x6e, 0x2e, 0x5f, - 0x5f, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, - 0x6d, 0x7c, 0x7c, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x6d, 0x3d, 0x5b, 0x30, - 0x2c, 0x30, 0x5d, 0x29, 0x3b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, 0x22, 0x50, - 0x22, 0x2b, 0x65, 0x5b, 0x30, 0x5d, 0x2b, 0x22, 0x2d, 0x22, 0x2b, 0x65, - 0x5b, 0x31, 0x5d, 0x2b, 0x2b, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x74, 0x2e, 0x5f, 0x5f, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x52, 0x74, 0x28, 0x29, 0x7b, 0x66, 0x6f, 0x72, 0x28, - 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, 0x74, 0x3d, 0x67, 0x74, 0x2e, 0x73, - 0x68, 0x69, 0x66, 0x74, 0x28, 0x29, 0x3b, 0x29, 0x69, 0x66, 0x28, 0x74, - 0x2e, 0x5f, 0x5f, 0x50, 0x26, 0x26, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x29, - 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, - 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x71, 0x74, - 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, - 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x42, 0x74, 0x29, 0x2c, - 0x74, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, - 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, - 0x5f, 0x5f, 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x6b, - 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x6e, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, - 0x76, 0x29, 0x7d, 0x7d, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x62, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x64, - 0x74, 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x53, 0x74, 0x26, 0x26, 0x53, - 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, - 0x29, 0x7b, 0x74, 0x26, 0x26, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x26, 0x26, - 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x2e, 0x5f, 0x5f, 0x6d, 0x26, 0x26, 0x28, - 0x74, 0x2e, 0x5f, 0x5f, 0x6d, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6b, 0x2e, - 0x5f, 0x5f, 0x6d, 0x29, 0x2c, 0x55, 0x74, 0x26, 0x26, 0x55, 0x74, 0x28, - 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x2c, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, 0x72, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, - 0x7b, 0x77, 0x74, 0x26, 0x26, 0x77, 0x74, 0x28, 0x74, 0x29, 0x2c, 0x70, - 0x74, 0x3d, 0x30, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x28, 0x64, - 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x29, 0x2e, 0x5f, 0x5f, 0x48, - 0x3b, 0x6e, 0x26, 0x26, 0x28, 0x76, 0x74, 0x3d, 0x3d, 0x3d, 0x64, 0x74, - 0x3f, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x64, - 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x6e, 0x2e, 0x5f, - 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, - 0x2e, 0x5f, 0x5f, 0x4e, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, - 0x3d, 0x62, 0x74, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x4e, 0x3d, 0x74, 0x2e, - 0x69, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x29, 0x29, 0x29, - 0x3a, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, - 0x61, 0x63, 0x68, 0x28, 0x71, 0x74, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, - 0x68, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x42, 0x74, - 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, 0x5d, 0x2c, 0x70, - 0x74, 0x3d, 0x30, 0x29, 0x29, 0x2c, 0x76, 0x74, 0x3d, 0x64, 0x74, 0x7d, - 0x2c, 0x6b, 0x74, 0x2e, 0x64, 0x69, 0x66, 0x66, 0x65, 0x64, 0x3d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x78, - 0x74, 0x26, 0x26, 0x78, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, - 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x6e, 0x26, 0x26, - 0x6e, 0x2e, 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x6e, 0x2e, 0x5f, 0x5f, - 0x48, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x26, 0x26, 0x28, 0x31, 0x21, 0x3d, 0x3d, 0x67, 0x74, 0x2e, 0x70, 0x75, - 0x73, 0x68, 0x28, 0x6e, 0x29, 0x26, 0x26, 0x79, 0x74, 0x3d, 0x3d, 0x3d, - 0x6b, 0x74, 0x2e, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, - 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, - 0x7c, 0x7c, 0x28, 0x28, 0x79, 0x74, 0x3d, 0x6b, 0x74, 0x2e, 0x72, 0x65, - 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x29, 0x7c, 0x7c, 0x49, 0x74, - 0x29, 0x28, 0x52, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x2e, 0x5f, 0x5f, 0x48, - 0x2e, 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, - 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, - 0x7b, 0x74, 0x2e, 0x69, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x48, - 0x3d, 0x74, 0x2e, 0x69, 0x29, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x21, - 0x3d, 0x3d, 0x62, 0x74, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x3d, - 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x29, 0x2c, 0x74, 0x2e, 0x69, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, 0x56, 0x3d, - 0x62, 0x74, 0x7d, 0x29, 0x29, 0x29, 0x2c, 0x76, 0x74, 0x3d, 0x64, 0x74, - 0x3d, 0x6e, 0x75, 0x6c, 0x6c, 0x7d, 0x2c, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, - 0x63, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, - 0x2c, 0x6e, 0x29, 0x7b, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, - 0x74, 0x72, 0x79, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x6f, - 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x71, 0x74, 0x29, 0x2c, 0x74, 0x2e, - 0x5f, 0x5f, 0x68, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x2e, 0x66, 0x69, - 0x6c, 0x74, 0x65, 0x72, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, - 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x21, 0x74, 0x2e, 0x5f, 0x5f, 0x7c, 0x7c, 0x42, 0x74, 0x28, 0x74, 0x29, - 0x7d, 0x29, 0x29, 0x7d, 0x63, 0x61, 0x74, 0x63, 0x68, 0x28, 0x75, 0x29, - 0x7b, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x74, 0x2e, 0x5f, - 0x5f, 0x68, 0x26, 0x26, 0x28, 0x74, 0x2e, 0x5f, 0x5f, 0x68, 0x3d, 0x5b, - 0x5d, 0x29, 0x7d, 0x29, 0x29, 0x2c, 0x6e, 0x3d, 0x5b, 0x5d, 0x2c, 0x6b, - 0x74, 0x2e, 0x5f, 0x5f, 0x65, 0x28, 0x75, 0x2c, 0x74, 0x2e, 0x5f, 0x5f, - 0x76, 0x29, 0x7d, 0x7d, 0x29, 0x29, 0x2c, 0x43, 0x74, 0x26, 0x26, 0x43, - 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7d, 0x2c, 0x6b, 0x74, 0x2e, 0x75, - 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x45, 0x74, 0x26, 0x26, 0x45, - 0x74, 0x28, 0x74, 0x29, 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, - 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x65, 0x26, 0x26, 0x65, 0x2e, - 0x5f, 0x5f, 0x48, 0x26, 0x26, 0x28, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x2e, - 0x5f, 0x5f, 0x2e, 0x66, 0x6f, 0x72, 0x45, 0x61, 0x63, 0x68, 0x28, 0x28, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, - 0x74, 0x72, 0x79, 0x7b, 0x71, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x63, 0x61, - 0x74, 0x63, 0x68, 0x28, 0x74, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x7d, 0x7d, - 0x29, 0x29, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x48, 0x3d, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x2c, 0x6e, 0x26, 0x26, 0x6b, 0x74, 0x2e, 0x5f, 0x5f, - 0x65, 0x28, 0x6e, 0x2c, 0x65, 0x2e, 0x5f, 0x5f, 0x76, 0x29, 0x29, 0x7d, - 0x3b, 0x76, 0x61, 0x72, 0x20, 0x6a, 0x74, 0x3d, 0x22, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, - 0x6f, 0x66, 0x20, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, - 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, - 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x49, 0x74, - 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x3d, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x29, 0x7b, 0x63, - 0x6c, 0x65, 0x61, 0x72, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, - 0x5f, 0x29, 0x2c, 0x6a, 0x74, 0x26, 0x26, 0x63, 0x61, 0x6e, 0x63, 0x65, - 0x6c, 0x41, 0x6e, 0x69, 0x6d, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, - 0x61, 0x6d, 0x65, 0x28, 0x6e, 0x29, 0x2c, 0x73, 0x65, 0x74, 0x54, 0x69, - 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x74, 0x29, 0x7d, 0x2c, 0x5f, 0x3d, - 0x73, 0x65, 0x74, 0x54, 0x69, 0x6d, 0x65, 0x6f, 0x75, 0x74, 0x28, 0x65, - 0x2c, 0x31, 0x30, 0x30, 0x29, 0x3b, 0x6a, 0x74, 0x26, 0x26, 0x28, 0x6e, - 0x3d, 0x72, 0x65, 0x71, 0x75, 0x65, 0x73, 0x74, 0x41, 0x6e, 0x69, 0x6d, - 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x46, 0x72, 0x61, 0x6d, 0x65, 0x28, 0x65, - 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x71, 0x74, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, - 0x64, 0x74, 0x2c, 0x65, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x22, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, - 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x65, 0x26, 0x26, 0x28, 0x74, 0x2e, - 0x5f, 0x5f, 0x63, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x2c, 0x65, - 0x28, 0x29, 0x29, 0x2c, 0x64, 0x74, 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, - 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x42, 0x74, 0x28, 0x74, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, 0x64, 0x74, 0x3b, 0x74, 0x2e, 0x5f, - 0x5f, 0x63, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x28, 0x29, 0x2c, 0x64, 0x74, - 0x3d, 0x6e, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x47, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x21, 0x74, 0x7c, 0x7c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x21, 0x3d, 0x3d, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x7c, 0x7c, 0x6e, 0x2e, 0x73, 0x6f, 0x6d, 0x65, 0x28, 0x28, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x6e, 0x2c, 0x65, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x21, 0x3d, 0x3d, - 0x74, 0x5b, 0x65, 0x5d, 0x7d, 0x29, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x7a, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x29, - 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x22, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, - 0x66, 0x20, 0x6e, 0x3f, 0x6e, 0x28, 0x74, 0x29, 0x3a, 0x6e, 0x7d, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x4a, 0x74, 0x28, 0x74, - 0x2c, 0x6e, 0x29, 0x7b, 0x43, 0x5b, 0x74, 0x5d, 0x3d, 0x6e, 0x2e, 0x62, - 0x69, 0x6e, 0x64, 0x28, 0x6e, 0x75, 0x6c, 0x6c, 0x2c, 0x43, 0x5b, 0x74, - 0x5d, 0x7c, 0x7c, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x7d, 0x29, 0x29, - 0x7d, 0x6c, 0x65, 0x74, 0x20, 0x4b, 0x74, 0x2c, 0x51, 0x74, 0x3b, 0x66, - 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x58, 0x74, 0x28, 0x74, - 0x29, 0x7b, 0x69, 0x66, 0x28, 0x51, 0x74, 0x29, 0x51, 0x74, 0x28, 0x29, - 0x3b, 0x51, 0x74, 0x3d, 0x74, 0x26, 0x26, 0x74, 0x2e, 0x53, 0x28, 0x29, - 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x59, 0x74, - 0x28, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, 0x74, 0x7d, 0x29, 0x7b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x6e, 0x28, 0x74, 0x29, - 0x3b, 0x6e, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x41, 0x74, 0x28, 0x28, 0x29, - 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x5f, 0x76, 0x3b, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x28, - 0x74, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, 0x29, 0x69, 0x66, 0x28, 0x74, 0x2e, - 0x5f, 0x5f, 0x63, 0x29, 0x7b, 0x74, 0x2e, 0x5f, 0x5f, 0x63, 0x2e, 0x5f, - 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x62, 0x72, 0x65, 0x61, 0x6b, - 0x7d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x2e, 0x63, - 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x74, 0x3b, - 0x69, 0x66, 0x28, 0x21, 0x55, 0x28, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, - 0x28, 0x29, 0x29, 0x26, 0x26, 0x33, 0x3d, 0x3d, 0x3d, 0x28, 0x6e, 0x75, - 0x6c, 0x6c, 0x3d, 0x3d, 0x28, 0x74, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x62, 0x61, 0x73, 0x65, 0x29, 0x3f, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, - 0x3a, 0x74, 0x2e, 0x6e, 0x6f, 0x64, 0x65, 0x54, 0x79, 0x70, 0x65, 0x29, - 0x29, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x62, 0x61, 0x73, 0x65, 0x2e, 0x64, - 0x61, 0x74, 0x61, 0x3d, 0x65, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, - 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x73, 0x65, 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, - 0x7d, 0x7d, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6d, 0x28, - 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, - 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x30, 0x3d, 0x3d, 0x3d, - 0x74, 0x3f, 0x30, 0x3a, 0x21, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x3f, 0x22, - 0x22, 0x3a, 0x74, 0x7c, 0x7c, 0x22, 0x22, 0x7d, 0x29, 0x7d, 0x2c, 0x5b, - 0x5d, 0x29, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x65, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x7d, 0x59, 0x74, 0x2e, 0x64, 0x69, 0x73, - 0x70, 0x6c, 0x61, 0x79, 0x4e, 0x61, 0x6d, 0x65, 0x3d, 0x22, 0x5f, 0x73, - 0x74, 0x22, 0x3b, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x64, 0x65, - 0x66, 0x69, 0x6e, 0x65, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x28, 0x68, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, - 0x70, 0x65, 0x2c, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x72, 0x75, 0x63, - 0x74, 0x6f, 0x72, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, - 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x2c, 0x74, - 0x79, 0x70, 0x65, 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, - 0x72, 0x61, 0x62, 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x3a, 0x59, 0x74, 0x7d, 0x2c, 0x70, 0x72, 0x6f, 0x70, 0x73, - 0x3a, 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, - 0x6c, 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x67, 0x65, 0x74, 0x28, 0x29, 0x7b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x7b, 0x64, 0x61, 0x74, 0x61, 0x3a, - 0x74, 0x68, 0x69, 0x73, 0x7d, 0x7d, 0x7d, 0x2c, 0x5f, 0x5f, 0x62, 0x3a, - 0x7b, 0x63, 0x6f, 0x6e, 0x66, 0x69, 0x67, 0x75, 0x72, 0x61, 0x62, 0x6c, - 0x65, 0x3a, 0x21, 0x30, 0x2c, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x31, - 0x7d, 0x7d, 0x29, 0x3b, 0x4a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x62, 0x22, - 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, - 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, - 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, - 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x2c, 0x65, 0x3d, 0x6e, 0x2e, 0x70, - 0x72, 0x6f, 0x70, 0x73, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, - 0x20, 0x5f, 0x20, 0x69, 0x6e, 0x20, 0x65, 0x29, 0x7b, 0x69, 0x66, 0x28, - 0x22, 0x63, 0x68, 0x69, 0x6c, 0x64, 0x72, 0x65, 0x6e, 0x22, 0x3d, 0x3d, - 0x3d, 0x5f, 0x29, 0x63, 0x6f, 0x6e, 0x74, 0x69, 0x6e, 0x75, 0x65, 0x3b, - 0x6c, 0x65, 0x74, 0x20, 0x69, 0x3d, 0x65, 0x5b, 0x5f, 0x5d, 0x3b, 0x69, - 0x66, 0x28, 0x69, 0x20, 0x69, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x63, 0x65, - 0x6f, 0x66, 0x20, 0x68, 0x29, 0x7b, 0x69, 0x66, 0x28, 0x21, 0x74, 0x29, - 0x6e, 0x2e, 0x5f, 0x5f, 0x6e, 0x70, 0x3d, 0x74, 0x3d, 0x7b, 0x7d, 0x3b, - 0x74, 0x5b, 0x5f, 0x5d, 0x3d, 0x69, 0x3b, 0x65, 0x5b, 0x5f, 0x5d, 0x3d, - 0x69, 0x2e, 0x70, 0x65, 0x65, 0x6b, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74, - 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x4a, 0x74, 0x28, 0x22, 0x5f, 0x5f, - 0x72, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x58, - 0x74, 0x28, 0x29, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x2c, 0x5f, 0x3d, - 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x69, 0x66, 0x28, 0x5f, 0x29, 0x7b, - 0x5f, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x26, 0x3d, 0x2d, 0x32, 0x3b, 0x65, - 0x3d, 0x5f, 0x2e, 0x5f, 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x65, 0x29, 0x5f, 0x2e, - 0x5f, 0x5f, 0x24, 0x75, 0x3d, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, - 0x3b, 0x77, 0x28, 0x28, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, - 0x28, 0x29, 0x7b, 0x6e, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x7d, 0x29, 0x29, - 0x3b, 0x6e, 0x2e, 0x63, 0x3d, 0x28, 0x29, 0x3d, 0x3e, 0x7b, 0x5f, 0x2e, - 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x31, 0x3b, 0x5f, 0x2e, 0x73, 0x65, - 0x74, 0x53, 0x74, 0x61, 0x74, 0x65, 0x28, 0x7b, 0x7d, 0x29, 0x7d, 0x3b, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7d, 0x28, 0x29, 0x7d, - 0x4b, 0x74, 0x3d, 0x5f, 0x3b, 0x58, 0x74, 0x28, 0x65, 0x29, 0x3b, 0x74, - 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x4a, 0x74, 0x28, 0x22, 0x5f, 0x5f, - 0x65, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, - 0x3d, 0x3e, 0x7b, 0x58, 0x74, 0x28, 0x29, 0x3b, 0x4b, 0x74, 0x3d, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x74, 0x28, 0x6e, 0x2c, 0x65, 0x2c, - 0x5f, 0x29, 0x7d, 0x29, 0x3b, 0x4a, 0x74, 0x28, 0x22, 0x64, 0x69, 0x66, - 0x66, 0x65, 0x64, 0x22, 0x2c, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, 0x3e, - 0x7b, 0x58, 0x74, 0x28, 0x29, 0x3b, 0x4b, 0x74, 0x3d, 0x76, 0x6f, 0x69, - 0x64, 0x20, 0x30, 0x3b, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x3b, 0x69, 0x66, - 0x28, 0x22, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, - 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, - 0x26, 0x26, 0x28, 0x65, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x29, 0x29, - 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x6e, - 0x70, 0x2c, 0x5f, 0x3d, 0x6e, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x73, 0x3b, - 0x69, 0x66, 0x28, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6e, 0x3d, - 0x65, 0x2e, 0x55, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x66, 0x6f, 0x72, - 0x28, 0x6c, 0x65, 0x74, 0x20, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, - 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x3d, 0x6e, 0x5b, 0x65, 0x5d, 0x3b, - 0x69, 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, - 0x5f, 0x26, 0x26, 0x21, 0x28, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, - 0x29, 0x7b, 0x5f, 0x2e, 0x64, 0x28, 0x29, 0x3b, 0x6e, 0x5b, 0x65, 0x5d, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x7d, 0x7d, 0x65, 0x6c, 0x73, - 0x65, 0x7b, 0x6e, 0x3d, 0x7b, 0x7d, 0x3b, 0x65, 0x2e, 0x55, 0x3d, 0x6e, - 0x7d, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x69, - 0x6e, 0x20, 0x74, 0x29, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x6f, 0x3d, 0x6e, - 0x5b, 0x69, 0x5d, 0x2c, 0x72, 0x3d, 0x74, 0x5b, 0x69, 0x5d, 0x3b, 0x69, - 0x66, 0x28, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x6f, - 0x29, 0x7b, 0x6f, 0x3d, 0x5a, 0x74, 0x28, 0x65, 0x2c, 0x69, 0x2c, 0x72, - 0x2c, 0x5f, 0x29, 0x3b, 0x6e, 0x5b, 0x69, 0x5d, 0x3d, 0x6f, 0x7d, 0x65, - 0x6c, 0x73, 0x65, 0x20, 0x6f, 0x2e, 0x6f, 0x28, 0x72, 0x2c, 0x5f, 0x29, - 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, 0x29, 0x3b, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x5a, 0x74, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x69, 0x3d, 0x6e, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x26, 0x26, 0x76, - 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3d, 0x3d, 0x3d, 0x74, 0x2e, 0x6f, 0x77, - 0x6e, 0x65, 0x72, 0x53, 0x56, 0x47, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, - 0x74, 0x2c, 0x6f, 0x3d, 0x61, 0x28, 0x65, 0x29, 0x3b, 0x72, 0x65, 0x74, - 0x75, 0x72, 0x6e, 0x7b, 0x6f, 0x3a, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x3d, - 0x3e, 0x7b, 0x6f, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3d, 0x74, 0x3b, - 0x5f, 0x3d, 0x6e, 0x7d, 0x2c, 0x64, 0x3a, 0x77, 0x28, 0x28, 0x29, 0x3d, - 0x3e, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x6f, 0x2e, - 0x76, 0x61, 0x6c, 0x75, 0x65, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3b, - 0x69, 0x66, 0x28, 0x5f, 0x5b, 0x6e, 0x5d, 0x21, 0x3d, 0x3d, 0x65, 0x29, - 0x7b, 0x5f, 0x5b, 0x6e, 0x5d, 0x3d, 0x65, 0x3b, 0x69, 0x66, 0x28, 0x69, - 0x29, 0x74, 0x5b, 0x6e, 0x5d, 0x3d, 0x65, 0x3b, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x69, 0x66, 0x28, 0x65, 0x29, 0x74, 0x2e, 0x73, 0x65, 0x74, 0x41, - 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, 0x28, 0x6e, 0x2c, 0x65, - 0x29, 0x3b, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x74, 0x2e, 0x72, 0x65, 0x6d, - 0x6f, 0x76, 0x65, 0x41, 0x74, 0x74, 0x72, 0x69, 0x62, 0x75, 0x74, 0x65, - 0x28, 0x6e, 0x29, 0x7d, 0x7d, 0x29, 0x7d, 0x7d, 0x4a, 0x74, 0x28, 0x22, - 0x75, 0x6e, 0x6d, 0x6f, 0x75, 0x6e, 0x74, 0x22, 0x2c, 0x28, 0x74, 0x2c, - 0x6e, 0x29, 0x3d, 0x3e, 0x7b, 0x69, 0x66, 0x28, 0x22, 0x73, 0x74, 0x72, - 0x69, 0x6e, 0x67, 0x22, 0x3d, 0x3d, 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, - 0x20, 0x6e, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x29, 0x7b, 0x6c, 0x65, 0x74, - 0x20, 0x74, 0x3d, 0x6e, 0x2e, 0x5f, 0x5f, 0x65, 0x3b, 0x69, 0x66, 0x28, - 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, - 0x2e, 0x55, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x55, - 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, - 0x6c, 0x65, 0x74, 0x20, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x6e, 0x29, 0x7b, - 0x6c, 0x65, 0x74, 0x20, 0x65, 0x3d, 0x6e, 0x5b, 0x74, 0x5d, 0x3b, 0x69, - 0x66, 0x28, 0x65, 0x29, 0x65, 0x2e, 0x64, 0x28, 0x29, 0x7d, 0x7d, 0x7d, - 0x7d, 0x65, 0x6c, 0x73, 0x65, 0x7b, 0x6c, 0x65, 0x74, 0x20, 0x74, 0x3d, - 0x6e, 0x2e, 0x5f, 0x5f, 0x63, 0x3b, 0x69, 0x66, 0x28, 0x74, 0x29, 0x7b, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x74, 0x2e, 0x5f, 0x5f, - 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x6e, 0x29, 0x7b, 0x74, 0x2e, 0x5f, - 0x5f, 0x24, 0x75, 0x3d, 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x3b, 0x6e, - 0x2e, 0x64, 0x28, 0x29, 0x7d, 0x7d, 0x7d, 0x74, 0x28, 0x6e, 0x29, 0x7d, - 0x29, 0x3b, 0x4a, 0x74, 0x28, 0x22, 0x5f, 0x5f, 0x68, 0x22, 0x2c, 0x28, - 0x74, 0x2c, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x3d, 0x3e, 0x7b, 0x69, - 0x66, 0x28, 0x5f, 0x3c, 0x33, 0x7c, 0x7c, 0x39, 0x3d, 0x3d, 0x3d, 0x5f, - 0x29, 0x6e, 0x2e, 0x5f, 0x5f, 0x24, 0x66, 0x7c, 0x3d, 0x32, 0x3b, 0x74, - 0x28, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7d, 0x29, 0x3b, 0x49, 0x2e, - 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x74, 0x79, 0x70, 0x65, 0x2e, 0x73, 0x68, - 0x6f, 0x75, 0x6c, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, - 0x74, 0x55, 0x70, 0x64, 0x61, 0x74, 0x65, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, 0x6e, 0x29, 0x7b, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x65, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x5f, 0x24, 0x75, 0x3b, 0x69, 0x66, 0x28, 0x21, 0x28, 0x65, 0x26, 0x26, - 0x76, 0x6f, 0x69, 0x64, 0x20, 0x30, 0x21, 0x3d, 0x3d, 0x65, 0x2e, 0x73, - 0x7c, 0x7c, 0x34, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, 0x24, - 0x66, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, - 0x69, 0x66, 0x28, 0x33, 0x26, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x5f, - 0x24, 0x66, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, - 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x20, 0x69, 0x6e, - 0x20, 0x6e, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x30, 0x3b, - 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, 0x5f, 0x20, 0x69, 0x6e, - 0x20, 0x74, 0x29, 0x69, 0x66, 0x28, 0x22, 0x5f, 0x5f, 0x73, 0x6f, 0x75, - 0x72, 0x63, 0x65, 0x22, 0x21, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x74, 0x5b, - 0x5f, 0x5d, 0x21, 0x3d, 0x3d, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x5b, 0x5f, 0x5d, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x21, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x6c, 0x65, 0x74, 0x20, - 0x5f, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x70, 0x72, - 0x6f, 0x70, 0x73, 0x29, 0x69, 0x66, 0x28, 0x21, 0x28, 0x5f, 0x20, 0x69, - 0x6e, 0x20, 0x74, 0x29, 0x29, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, - 0x30, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x21, 0x31, 0x7d, 0x3b, - 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x74, 0x6e, 0x28, - 0x74, 0x29, 0x7b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x41, 0x74, - 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x61, 0x28, 0x74, 0x29, 0x2c, 0x5b, 0x5d, - 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6e, - 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, - 0x3d, 0x54, 0x74, 0x28, 0x74, 0x29, 0x3b, 0x6e, 0x2e, 0x63, 0x75, 0x72, - 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, 0x4b, 0x74, 0x2e, 0x5f, 0x5f, - 0x24, 0x66, 0x7c, 0x3d, 0x34, 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x41, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x6d, 0x28, 0x28, 0x29, - 0x3d, 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, - 0x29, 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x66, 0x75, 0x6e, 0x63, 0x74, - 0x69, 0x6f, 0x6e, 0x20, 0x65, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x3d, 0x54, 0x74, 0x28, 0x74, 0x29, 0x3b, - 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x3d, 0x74, 0x3b, - 0x24, 0x74, 0x28, 0x28, 0x29, 0x3d, 0x3e, 0x77, 0x28, 0x28, 0x29, 0x3d, - 0x3e, 0x6e, 0x2e, 0x63, 0x75, 0x72, 0x72, 0x65, 0x6e, 0x74, 0x28, 0x29, - 0x29, 0x2c, 0x5b, 0x5d, 0x29, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x5f, 0x6e, - 0x3d, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x2c, - 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x69, - 0x3b, 0x6e, 0x5b, 0x30, 0x5d, 0x3d, 0x30, 0x3b, 0x66, 0x6f, 0x72, 0x28, - 0x76, 0x61, 0x72, 0x20, 0x6f, 0x3d, 0x31, 0x3b, 0x6f, 0x3c, 0x6e, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6f, 0x2b, 0x2b, 0x29, 0x7b, - 0x76, 0x61, 0x72, 0x20, 0x72, 0x3d, 0x6e, 0x5b, 0x6f, 0x2b, 0x2b, 0x5d, - 0x2c, 0x75, 0x3d, 0x6e, 0x5b, 0x6f, 0x5d, 0x3f, 0x28, 0x6e, 0x5b, 0x30, - 0x5d, 0x7c, 0x3d, 0x72, 0x3f, 0x31, 0x3a, 0x32, 0x2c, 0x65, 0x5b, 0x6e, - 0x5b, 0x6f, 0x2b, 0x2b, 0x5d, 0x5d, 0x29, 0x3a, 0x6e, 0x5b, 0x2b, 0x2b, - 0x6f, 0x5d, 0x3b, 0x33, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b, 0x30, - 0x5d, 0x3d, 0x75, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b, - 0x31, 0x5d, 0x3d, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x61, 0x73, - 0x73, 0x69, 0x67, 0x6e, 0x28, 0x5f, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b, - 0x7d, 0x2c, 0x75, 0x29, 0x3a, 0x35, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x28, - 0x5f, 0x5b, 0x31, 0x5d, 0x3d, 0x5f, 0x5b, 0x31, 0x5d, 0x7c, 0x7c, 0x7b, - 0x7d, 0x29, 0x5b, 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x3d, 0x75, - 0x3a, 0x36, 0x3d, 0x3d, 0x3d, 0x72, 0x3f, 0x5f, 0x5b, 0x31, 0x5d, 0x5b, - 0x6e, 0x5b, 0x2b, 0x2b, 0x6f, 0x5d, 0x5d, 0x2b, 0x3d, 0x75, 0x2b, 0x22, - 0x22, 0x3a, 0x72, 0x3f, 0x28, 0x69, 0x3d, 0x74, 0x2e, 0x61, 0x70, 0x70, - 0x6c, 0x79, 0x28, 0x75, 0x2c, 0x5f, 0x6e, 0x28, 0x74, 0x2c, 0x75, 0x2c, - 0x65, 0x2c, 0x5b, 0x22, 0x22, 0x2c, 0x6e, 0x75, 0x6c, 0x6c, 0x5d, 0x29, - 0x29, 0x2c, 0x5f, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x69, 0x29, 0x2c, - 0x75, 0x5b, 0x30, 0x5d, 0x3f, 0x6e, 0x5b, 0x30, 0x5d, 0x7c, 0x3d, 0x32, - 0x3a, 0x28, 0x6e, 0x5b, 0x6f, 0x2d, 0x32, 0x5d, 0x3d, 0x30, 0x2c, 0x6e, - 0x5b, 0x6f, 0x5d, 0x3d, 0x69, 0x29, 0x29, 0x3a, 0x5f, 0x2e, 0x70, 0x75, - 0x73, 0x68, 0x28, 0x75, 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x5f, 0x7d, 0x2c, 0x6f, 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, - 0x61, 0x70, 0x3b, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x20, - 0x72, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x3d, - 0x6f, 0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x29, - 0x3b, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, 0x7c, 0x7c, 0x28, - 0x6e, 0x3d, 0x6e, 0x65, 0x77, 0x20, 0x4d, 0x61, 0x70, 0x2c, 0x6f, 0x6e, - 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x68, 0x69, 0x73, 0x2c, 0x6e, 0x29, - 0x29, 0x2c, 0x28, 0x6e, 0x3d, 0x5f, 0x6e, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2c, 0x6e, 0x2e, 0x67, 0x65, 0x74, 0x28, 0x74, 0x29, 0x7c, 0x7c, 0x28, - 0x6e, 0x2e, 0x73, 0x65, 0x74, 0x28, 0x74, 0x2c, 0x6e, 0x3d, 0x66, 0x75, - 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x66, 0x6f, - 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6e, 0x2c, 0x65, 0x2c, 0x5f, 0x3d, - 0x31, 0x2c, 0x69, 0x3d, 0x22, 0x22, 0x2c, 0x6f, 0x3d, 0x22, 0x22, 0x2c, - 0x72, 0x3d, 0x5b, 0x30, 0x5d, 0x2c, 0x75, 0x3d, 0x66, 0x75, 0x6e, 0x63, - 0x74, 0x69, 0x6f, 0x6e, 0x28, 0x74, 0x29, 0x7b, 0x31, 0x3d, 0x3d, 0x3d, - 0x5f, 0x26, 0x26, 0x28, 0x74, 0x7c, 0x7c, 0x28, 0x69, 0x3d, 0x69, 0x2e, - 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x2f, 0x5e, 0x5c, 0x73, - 0x2a, 0x5c, 0x6e, 0x5c, 0x73, 0x2a, 0x7c, 0x5c, 0x73, 0x2a, 0x5c, 0x6e, - 0x5c, 0x73, 0x2a, 0x24, 0x2f, 0x67, 0x2c, 0x22, 0x22, 0x29, 0x29, 0x29, - 0x3f, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x30, 0x2c, 0x74, 0x2c, - 0x69, 0x29, 0x3a, 0x33, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x28, 0x74, - 0x7c, 0x7c, 0x69, 0x29, 0x3f, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x33, 0x2c, 0x74, 0x2c, 0x69, 0x29, 0x2c, 0x5f, 0x3d, 0x32, 0x29, - 0x3a, 0x32, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x22, 0x2e, 0x2e, 0x2e, - 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x74, 0x3f, 0x72, 0x2e, 0x70, - 0x75, 0x73, 0x68, 0x28, 0x34, 0x2c, 0x74, 0x2c, 0x30, 0x29, 0x3a, 0x32, - 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x69, 0x26, 0x26, 0x21, 0x74, 0x3f, - 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x35, 0x2c, 0x30, 0x2c, 0x21, - 0x30, 0x2c, 0x69, 0x29, 0x3a, 0x5f, 0x3e, 0x3d, 0x35, 0x26, 0x26, 0x28, - 0x28, 0x69, 0x7c, 0x7c, 0x21, 0x74, 0x26, 0x26, 0x35, 0x3d, 0x3d, 0x3d, - 0x5f, 0x29, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, - 0x5f, 0x2c, 0x30, 0x2c, 0x69, 0x2c, 0x65, 0x29, 0x2c, 0x5f, 0x3d, 0x36, - 0x29, 0x2c, 0x74, 0x26, 0x26, 0x28, 0x72, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x5f, 0x2c, 0x74, 0x2c, 0x30, 0x2c, 0x65, 0x29, 0x2c, 0x5f, 0x3d, - 0x36, 0x29, 0x29, 0x2c, 0x69, 0x3d, 0x22, 0x22, 0x7d, 0x2c, 0x66, 0x3d, - 0x30, 0x3b, 0x66, 0x3c, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x3b, 0x66, 0x2b, 0x2b, 0x29, 0x7b, 0x66, 0x26, 0x26, 0x28, 0x31, 0x3d, - 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x75, 0x28, 0x29, 0x2c, 0x75, 0x28, 0x66, - 0x29, 0x29, 0x3b, 0x66, 0x6f, 0x72, 0x28, 0x76, 0x61, 0x72, 0x20, 0x6c, - 0x3d, 0x30, 0x3b, 0x6c, 0x3c, 0x74, 0x5b, 0x66, 0x5d, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x3b, 0x6c, 0x2b, 0x2b, 0x29, 0x6e, 0x3d, 0x74, - 0x5b, 0x66, 0x5d, 0x5b, 0x6c, 0x5d, 0x2c, 0x31, 0x3d, 0x3d, 0x3d, 0x5f, - 0x3f, 0x22, 0x3c, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, - 0x29, 0x2c, 0x72, 0x3d, 0x5b, 0x72, 0x5d, 0x2c, 0x5f, 0x3d, 0x33, 0x29, - 0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x3a, 0x34, 0x3d, 0x3d, 0x3d, 0x5f, 0x3f, - 0x22, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26, 0x26, 0x22, 0x3e, - 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, 0x5f, 0x3d, 0x31, 0x2c, 0x69, - 0x3d, 0x22, 0x22, 0x29, 0x3a, 0x69, 0x3d, 0x6e, 0x2b, 0x69, 0x5b, 0x30, - 0x5d, 0x3a, 0x6f, 0x3f, 0x6e, 0x3d, 0x3d, 0x3d, 0x6f, 0x3f, 0x6f, 0x3d, - 0x22, 0x22, 0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x3a, 0x27, 0x22, 0x27, 0x3d, - 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x27, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, - 0x3f, 0x6f, 0x3d, 0x6e, 0x3a, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, - 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x5f, 0x3d, 0x31, 0x29, 0x3a, 0x5f, - 0x26, 0x26, 0x28, 0x22, 0x3d, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x3f, 0x28, - 0x5f, 0x3d, 0x35, 0x2c, 0x65, 0x3d, 0x69, 0x2c, 0x69, 0x3d, 0x22, 0x22, - 0x29, 0x3a, 0x22, 0x2f, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x26, 0x26, 0x28, - 0x5f, 0x3c, 0x35, 0x7c, 0x7c, 0x22, 0x3e, 0x22, 0x3d, 0x3d, 0x3d, 0x74, - 0x5b, 0x66, 0x5d, 0x5b, 0x6c, 0x2b, 0x31, 0x5d, 0x29, 0x3f, 0x28, 0x75, - 0x28, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x5f, 0x26, 0x26, 0x28, 0x72, - 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2c, 0x5f, 0x3d, 0x72, 0x2c, 0x28, - 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, 0x29, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x32, 0x2c, 0x30, 0x2c, 0x5f, 0x29, 0x2c, 0x5f, 0x3d, 0x30, 0x29, - 0x3a, 0x22, 0x20, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, - 0x74, 0x22, 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x6e, 0x22, - 0x3d, 0x3d, 0x3d, 0x6e, 0x7c, 0x7c, 0x22, 0x5c, 0x72, 0x22, 0x3d, 0x3d, - 0x3d, 0x6e, 0x3f, 0x28, 0x75, 0x28, 0x29, 0x2c, 0x5f, 0x3d, 0x32, 0x29, - 0x3a, 0x69, 0x2b, 0x3d, 0x6e, 0x29, 0x2c, 0x33, 0x3d, 0x3d, 0x3d, 0x5f, - 0x26, 0x26, 0x22, 0x21, 0x2d, 0x2d, 0x22, 0x3d, 0x3d, 0x3d, 0x69, 0x26, - 0x26, 0x28, 0x5f, 0x3d, 0x34, 0x2c, 0x72, 0x3d, 0x72, 0x5b, 0x30, 0x5d, - 0x29, 0x7d, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x75, 0x28, 0x29, - 0x2c, 0x72, 0x7d, 0x28, 0x74, 0x29, 0x29, 0x2c, 0x6e, 0x29, 0x2c, 0x61, - 0x72, 0x67, 0x75, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x2c, 0x5b, 0x5d, 0x29, - 0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3e, 0x31, 0x3f, 0x6e, - 0x3a, 0x6e, 0x5b, 0x30, 0x5d, 0x7d, 0x76, 0x61, 0x72, 0x20, 0x75, 0x6e, - 0x3d, 0x72, 0x6e, 0x2e, 0x62, 0x69, 0x6e, 0x64, 0x28, 0x4c, 0x29, 0x3b, - 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x7b, 0x49, 0x20, 0x61, 0x73, 0x20, - 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x2c, 0x6a, 0x20, - 0x61, 0x73, 0x20, 0x46, 0x72, 0x61, 0x67, 0x6d, 0x65, 0x6e, 0x74, 0x2c, - 0x68, 0x20, 0x61, 0x73, 0x20, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, - 0x5f, 0x20, 0x61, 0x73, 0x20, 0x62, 0x61, 0x74, 0x63, 0x68, 0x2c, 0x68, - 0x74, 0x20, 0x61, 0x73, 0x20, 0x63, 0x6c, 0x6f, 0x6e, 0x65, 0x45, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x6d, 0x20, 0x61, 0x73, 0x20, 0x63, - 0x6f, 0x6d, 0x70, 0x75, 0x74, 0x65, 0x64, 0x2c, 0x61, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x74, - 0x65, 0x78, 0x74, 0x2c, 0x4c, 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, - 0x61, 0x74, 0x65, 0x45, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x52, - 0x20, 0x61, 0x73, 0x20, 0x63, 0x72, 0x65, 0x61, 0x74, 0x65, 0x52, 0x65, - 0x66, 0x2c, 0x77, 0x20, 0x61, 0x73, 0x20, 0x65, 0x66, 0x66, 0x65, 0x63, - 0x74, 0x2c, 0x4c, 0x20, 0x61, 0x73, 0x20, 0x68, 0x2c, 0x75, 0x6e, 0x20, - 0x61, 0x73, 0x20, 0x68, 0x74, 0x6d, 0x6c, 0x2c, 0x63, 0x74, 0x20, 0x61, - 0x73, 0x20, 0x68, 0x79, 0x64, 0x72, 0x61, 0x74, 0x65, 0x2c, 0x55, 0x20, - 0x61, 0x73, 0x20, 0x69, 0x73, 0x56, 0x61, 0x6c, 0x69, 0x64, 0x45, 0x6c, - 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x2c, 0x43, 0x20, 0x61, 0x73, 0x20, 0x6f, - 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x73, 0x2c, 0x73, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x72, 0x65, 0x6e, 0x64, 0x65, 0x72, 0x2c, 0x61, 0x20, 0x61, 0x73, - 0x20, 0x73, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x59, 0x20, 0x61, 0x73, - 0x20, 0x74, 0x6f, 0x43, 0x68, 0x69, 0x6c, 0x64, 0x41, 0x72, 0x72, 0x61, - 0x79, 0x2c, 0x75, 0x20, 0x61, 0x73, 0x20, 0x75, 0x6e, 0x74, 0x72, 0x61, - 0x63, 0x6b, 0x65, 0x64, 0x2c, 0x46, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, - 0x73, 0x65, 0x43, 0x61, 0x6c, 0x6c, 0x62, 0x61, 0x63, 0x6b, 0x2c, 0x6e, - 0x6e, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x43, 0x6f, 0x6d, 0x70, - 0x75, 0x74, 0x65, 0x64, 0x2c, 0x4d, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, - 0x73, 0x65, 0x43, 0x6f, 0x6e, 0x74, 0x65, 0x78, 0x74, 0x2c, 0x57, 0x74, - 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x44, 0x65, 0x62, 0x75, 0x67, - 0x56, 0x61, 0x6c, 0x75, 0x65, 0x2c, 0x24, 0x74, 0x20, 0x61, 0x73, 0x20, - 0x75, 0x73, 0x65, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x4c, 0x74, - 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x42, 0x6f, 0x75, 0x6e, 0x64, 0x61, 0x72, 0x79, 0x2c, 0x4f, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x49, 0x64, 0x2c, 0x56, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x49, 0x6d, 0x70, 0x65, 0x72, 0x61, - 0x74, 0x69, 0x76, 0x65, 0x48, 0x61, 0x6e, 0x64, 0x6c, 0x65, 0x2c, 0x44, - 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4c, 0x61, 0x79, 0x6f, - 0x75, 0x74, 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x41, 0x74, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x4d, 0x65, 0x6d, 0x6f, 0x2c, 0x4e, - 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x52, 0x65, 0x64, 0x75, - 0x63, 0x65, 0x72, 0x2c, 0x54, 0x74, 0x20, 0x61, 0x73, 0x20, 0x75, 0x73, - 0x65, 0x52, 0x65, 0x66, 0x2c, 0x74, 0x6e, 0x20, 0x61, 0x73, 0x20, 0x75, - 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, 0x2c, 0x65, 0x6e, 0x20, - 0x61, 0x73, 0x20, 0x75, 0x73, 0x65, 0x53, 0x69, 0x67, 0x6e, 0x61, 0x6c, - 0x45, 0x66, 0x66, 0x65, 0x63, 0x74, 0x2c, 0x50, 0x74, 0x20, 0x61, 0x73, - 0x20, 0x75, 0x73, 0x65, 0x53, 0x74, 0x61, 0x74, 0x65, 0x7d, 0x3b, 0x0a -}; -size_t index_js_len = 23136; diff --git a/examples/server/json-schema-to-grammar.mjs.hpp b/examples/server/json-schema-to-grammar.mjs.hpp deleted file mode 100644 index 72fb6b80f..000000000 --- a/examples/server/json-schema-to-grammar.mjs.hpp +++ /dev/null @@ -1,1667 +0,0 @@ -unsigned char json_schema_to_grammar_mjs[] = { - 0x2f, 0x2f, 0x20, 0x57, 0x41, 0x52, 0x4e, 0x49, 0x4e, 0x47, 0x3a, 0x20, - 0x54, 0x68, 0x69, 0x73, 0x20, 0x66, 0x69, 0x6c, 0x65, 0x20, 0x77, 0x61, - 0x73, 0x20, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x20, 0x66, 0x72, 0x6f, - 0x6d, 0x20, 0x6a, 0x73, 0x6f, 0x6e, 0x2d, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x2d, 0x74, 0x6f, 0x2d, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, - 0x2e, 0x70, 0x79, 0x2c, 0x20, 0x70, 0x6c, 0x65, 0x61, 0x73, 0x65, 0x20, - 0x66, 0x69, 0x78, 0x20, 0x62, 0x75, 0x67, 0x73, 0x20, 0x2f, 0x20, 0x61, - 0x64, 0x64, 0x20, 0x66, 0x65, 0x61, 0x74, 0x75, 0x72, 0x65, 0x73, 0x20, - 0x74, 0x68, 0x65, 0x72, 0x65, 0x20, 0x66, 0x69, 0x72, 0x73, 0x74, 0x2e, - 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, - 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x20, 0x3d, 0x20, 0x27, 0x22, 0x20, 0x22, - 0x3f, 0x27, 0x3b, 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x50, - 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, - 0x45, 0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x62, 0x6f, 0x6f, - 0x6c, 0x65, 0x61, 0x6e, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x74, 0x72, 0x75, - 0x65, 0x22, 0x20, 0x7c, 0x20, 0x22, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x22, - 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, - 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, 0x2d, - 0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, - 0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2a, - 0x29, 0x29, 0x20, 0x28, 0x22, 0x2e, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, - 0x5d, 0x2b, 0x29, 0x3f, 0x20, 0x28, 0x5b, 0x65, 0x45, 0x5d, 0x20, 0x5b, - 0x2d, 0x2b, 0x5d, 0x3f, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x2b, 0x29, - 0x3f, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, - 0x69, 0x6e, 0x74, 0x65, 0x67, 0x65, 0x72, 0x3a, 0x20, 0x27, 0x28, 0x22, - 0x2d, 0x22, 0x3f, 0x20, 0x28, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, - 0x20, 0x5b, 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, - 0x2a, 0x29, 0x29, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, - 0x20, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x3a, 0x20, 0x27, 0x6f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x20, 0x7c, 0x20, 0x61, 0x72, 0x72, 0x61, 0x79, - 0x20, 0x7c, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x7c, 0x20, - 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x7c, 0x20, 0x62, 0x6f, 0x6f, - 0x6c, 0x65, 0x61, 0x6e, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x6f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x3a, 0x20, 0x27, 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, - 0x61, 0x63, 0x65, 0x20, 0x28, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x20, 0x22, 0x3a, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x20, 0x28, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, - 0x61, 0x63, 0x65, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0x22, - 0x3a, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x29, 0x2a, 0x20, 0x29, 0x3f, 0x20, 0x22, 0x7d, 0x22, 0x20, - 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x61, 0x72, - 0x72, 0x61, 0x79, 0x3a, 0x20, 0x27, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, - 0x61, 0x63, 0x65, 0x20, 0x28, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, - 0x28, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x29, 0x2a, 0x20, 0x29, 0x3f, 0x20, 0x22, 0x5d, - 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, - 0x75, 0x75, 0x69, 0x64, 0x3a, 0x20, 0x27, 0x22, 0x5c, 0x5c, 0x22, 0x22, - 0x20, 0x27, 0x20, 0x2b, 0x20, 0x5b, 0x38, 0x2c, 0x20, 0x34, 0x2c, 0x20, - 0x34, 0x2c, 0x20, 0x34, 0x2c, 0x20, 0x31, 0x32, 0x5d, 0x2e, 0x6d, 0x61, - 0x70, 0x28, 0x6e, 0x20, 0x3d, 0x3e, 0x20, 0x5b, 0x2e, 0x2e, 0x2e, 0x6e, - 0x65, 0x77, 0x20, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x6e, 0x29, 0x5d, - 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x5f, 0x20, 0x3d, 0x3e, 0x20, 0x27, 0x5b, - 0x30, 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, 0x5d, 0x27, 0x29, - 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x29, 0x2e, 0x6a, - 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x27, 0x29, - 0x20, 0x2b, 0x20, 0x27, 0x20, 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, - 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x73, 0x74, 0x72, - 0x69, 0x6e, 0x67, 0x3a, 0x20, 0x60, 0x20, 0x22, 0x5c, 0x5c, 0x22, 0x22, - 0x20, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x5b, - 0x5e, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x5d, 0x20, 0x7c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x22, - 0x20, 0x28, 0x5b, 0x22, 0x5c, 0x5c, 0x5c, 0x5c, 0x2f, 0x62, 0x66, 0x6e, - 0x72, 0x74, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x75, 0x22, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, 0x5d, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, 0x5d, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, 0x5d, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x61, 0x2d, 0x66, 0x41, 0x2d, 0x46, 0x5d, 0x29, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x2a, 0x20, 0x22, 0x5c, 0x5c, 0x22, - 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x60, 0x2c, 0x0a, 0x20, 0x20, - 0x6e, 0x75, 0x6c, 0x6c, 0x3a, 0x20, 0x27, 0x22, 0x6e, 0x75, 0x6c, 0x6c, - 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x7d, 0x3b, - 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4f, 0x42, 0x4a, 0x45, 0x43, - 0x54, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, - 0x20, 0x3d, 0x20, 0x5b, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, - 0x2c, 0x20, 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27, 0x2c, 0x20, 0x27, - 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x2c, 0x20, 0x27, 0x6e, 0x75, - 0x6d, 0x62, 0x65, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x62, 0x6f, 0x6f, 0x6c, - 0x65, 0x61, 0x6e, 0x27, 0x2c, 0x20, 0x27, 0x6e, 0x75, 0x6c, 0x6c, 0x27, - 0x2c, 0x20, 0x27, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x27, 0x5d, 0x3b, 0x0a, - 0x0a, 0x2f, 0x2f, 0x20, 0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x73, 0x75, - 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x22, 0x75, 0x72, 0x69, 0x22, 0x2c, - 0x20, 0x22, 0x65, 0x6d, 0x61, 0x69, 0x6c, 0x22, 0x20, 0x73, 0x74, 0x72, - 0x69, 0x6e, 0x67, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x73, 0x0a, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x44, 0x41, 0x54, 0x45, 0x5f, 0x52, - 0x55, 0x4c, 0x45, 0x53, 0x20, 0x3d, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x27, 0x64, 0x61, 0x74, 0x65, 0x27, 0x20, 0x20, 0x20, 0x3a, 0x20, - 0x27, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, - 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, - 0x20, 0x22, 0x2d, 0x22, 0x20, 0x28, 0x20, 0x22, 0x30, 0x22, 0x20, 0x5b, - 0x31, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x31, 0x22, 0x20, 0x5b, - 0x30, 0x2d, 0x32, 0x5d, 0x20, 0x29, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x28, - 0x20, 0x5c, 0x22, 0x30, 0x5c, 0x22, 0x20, 0x5b, 0x31, 0x2d, 0x39, 0x5d, - 0x20, 0x7c, 0x20, 0x5b, 0x31, 0x2d, 0x32, 0x5d, 0x20, 0x5b, 0x30, 0x2d, - 0x39, 0x5d, 0x20, 0x7c, 0x20, 0x22, 0x33, 0x22, 0x20, 0x5b, 0x30, 0x2d, - 0x31, 0x5d, 0x20, 0x29, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x27, - 0x74, 0x69, 0x6d, 0x65, 0x27, 0x20, 0x20, 0x20, 0x3a, 0x20, 0x27, 0x28, - 0x5b, 0x30, 0x31, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, - 0x20, 0x22, 0x32, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x33, 0x5d, 0x29, 0x20, - 0x22, 0x3a, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x35, 0x5d, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x5d, 0x20, 0x22, 0x3a, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x35, - 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x28, 0x20, 0x22, 0x2e, - 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, - 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x29, 0x3f, 0x20, 0x28, - 0x20, 0x22, 0x5a, 0x22, 0x20, 0x7c, 0x20, 0x28, 0x20, 0x22, 0x2b, 0x22, - 0x20, 0x7c, 0x20, 0x22, 0x2d, 0x22, 0x20, 0x29, 0x20, 0x28, 0x20, 0x5b, - 0x30, 0x31, 0x5d, 0x20, 0x5b, 0x30, 0x2d, 0x39, 0x5d, 0x20, 0x7c, 0x20, - 0x22, 0x32, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x33, 0x5d, 0x20, 0x29, 0x20, - 0x22, 0x3a, 0x22, 0x20, 0x5b, 0x30, 0x2d, 0x35, 0x5d, 0x20, 0x5b, 0x30, - 0x2d, 0x39, 0x5d, 0x20, 0x29, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x27, 0x64, 0x61, 0x74, 0x65, 0x2d, 0x74, 0x69, 0x6d, 0x65, 0x27, 0x3a, - 0x20, 0x27, 0x64, 0x61, 0x74, 0x65, 0x20, 0x22, 0x54, 0x22, 0x20, 0x74, - 0x69, 0x6d, 0x65, 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x27, 0x64, - 0x61, 0x74, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x3a, - 0x20, 0x27, 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x64, 0x61, 0x74, 0x65, - 0x20, 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, - 0x27, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x27, 0x74, 0x69, 0x6d, 0x65, - 0x2d, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x3a, 0x20, 0x27, 0x22, - 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x74, 0x69, 0x6d, 0x65, 0x20, 0x22, 0x5c, - 0x5c, 0x22, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x27, 0x64, 0x61, 0x74, 0x65, 0x2d, 0x74, 0x69, - 0x6d, 0x65, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x3a, 0x20, - 0x27, 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x64, 0x61, 0x74, 0x65, 0x2d, - 0x74, 0x69, 0x6d, 0x65, 0x20, 0x22, 0x5c, 0x5c, 0x22, 0x22, 0x20, 0x73, - 0x70, 0x61, 0x63, 0x65, 0x27, 0x2c, 0x0a, 0x7d, 0x3b, 0x0a, 0x0a, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x52, 0x45, 0x53, 0x45, 0x52, 0x56, 0x45, - 0x44, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, 0x20, 0x3d, 0x20, 0x7b, 0x27, - 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x3a, 0x20, 0x74, 0x72, 0x75, 0x65, 0x2c, - 0x20, 0x2e, 0x2e, 0x2e, 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, - 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x2c, 0x20, 0x2e, 0x2e, 0x2e, - 0x44, 0x41, 0x54, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x7d, 0x3b, - 0x0a, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x49, 0x4e, 0x56, 0x41, - 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x43, 0x48, 0x41, - 0x52, 0x53, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5e, 0x5c, - 0x64, 0x41, 0x2d, 0x5a, 0x61, 0x2d, 0x7a, 0x2d, 0x5d, 0x2b, 0x2f, 0x67, - 0x3b, 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52, 0x41, 0x4d, - 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, - 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52, 0x45, 0x20, 0x3d, 0x20, - 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5d, 0x2f, 0x67, 0x3b, 0x0a, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, - 0x52, 0x5f, 0x52, 0x41, 0x4e, 0x47, 0x45, 0x5f, 0x4c, 0x49, 0x54, 0x45, - 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52, - 0x45, 0x20, 0x3d, 0x20, 0x2f, 0x5b, 0x5c, 0x6e, 0x5c, 0x72, 0x22, 0x5c, - 0x5d, 0x5c, 0x2d, 0x5c, 0x5c, 0x5d, 0x2f, 0x67, 0x3b, 0x0a, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, - 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, - 0x50, 0x45, 0x53, 0x20, 0x3d, 0x20, 0x7b, 0x20, 0x27, 0x5c, 0x72, 0x27, - 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x72, 0x27, 0x2c, 0x20, 0x27, 0x5c, 0x6e, - 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x6e, 0x27, 0x2c, 0x20, 0x27, 0x22, - 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x22, 0x27, 0x2c, 0x20, 0x27, 0x2d, - 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x2d, 0x27, 0x2c, 0x20, 0x27, 0x5d, - 0x27, 0x3a, 0x20, 0x27, 0x5c, 0x5c, 0x5d, 0x27, 0x20, 0x7d, 0x3b, 0x0a, - 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x4e, 0x4f, 0x4e, 0x5f, 0x4c, - 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x53, 0x45, 0x54, 0x20, 0x3d, - 0x20, 0x6e, 0x65, 0x77, 0x20, 0x53, 0x65, 0x74, 0x28, 0x27, 0x7c, 0x2e, - 0x28, 0x29, 0x5b, 0x5d, 0x7b, 0x7d, 0x2a, 0x2b, 0x3f, 0x27, 0x29, 0x3b, - 0x0a, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x45, 0x53, 0x43, 0x41, 0x50, - 0x45, 0x44, 0x5f, 0x49, 0x4e, 0x5f, 0x52, 0x45, 0x47, 0x45, 0x58, 0x50, - 0x53, 0x5f, 0x42, 0x55, 0x54, 0x5f, 0x4e, 0x4f, 0x54, 0x5f, 0x49, 0x4e, - 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x53, 0x20, 0x3d, 0x20, - 0x6e, 0x65, 0x77, 0x20, 0x53, 0x65, 0x74, 0x28, 0x27, 0x5b, 0x5d, 0x28, - 0x29, 0x7c, 0x7b, 0x7d, 0x2a, 0x2b, 0x3f, 0x27, 0x29, 0x3b, 0x0a, 0x0a, - 0x65, 0x78, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x63, 0x6c, 0x61, 0x73, 0x73, - 0x20, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x43, 0x6f, 0x6e, 0x76, 0x65, - 0x72, 0x74, 0x65, 0x72, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x6f, 0x72, 0x28, 0x6f, 0x70, 0x74, - 0x69, 0x6f, 0x6e, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, - 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, - 0x73, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, - 0x20, 0x7c, 0x7c, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x46, - 0x65, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, - 0x6e, 0x73, 0x2e, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x5f, 0x66, 0x65, 0x74, - 0x63, 0x68, 0x20, 0x7c, 0x7c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x64, - 0x6f, 0x74, 0x61, 0x6c, 0x6c, 0x20, 0x3d, 0x20, 0x6f, 0x70, 0x74, 0x69, - 0x6f, 0x6e, 0x73, 0x2e, 0x64, 0x6f, 0x74, 0x61, 0x6c, 0x6c, 0x20, 0x7c, - 0x7c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, - 0x20, 0x3d, 0x20, 0x7b, 0x27, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x3a, - 0x20, 0x53, 0x50, 0x41, 0x43, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x7d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x72, 0x65, 0x66, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, - 0x73, 0x42, 0x65, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x53, 0x65, 0x74, - 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, - 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, - 0x6c, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, - 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x4a, 0x53, 0x4f, - 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, - 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x2e, 0x72, 0x65, 0x70, - 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, 0x45, - 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52, - 0x45, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20, 0x3d, - 0x3e, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, - 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, - 0x53, 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x60, - 0x22, 0x24, 0x7b, 0x65, 0x73, 0x63, 0x61, 0x70, 0x65, 0x64, 0x7d, 0x22, - 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x66, - 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x52, 0x61, 0x6e, 0x67, 0x65, 0x43, 0x68, - 0x61, 0x72, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x4a, 0x53, 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x69, 0x66, 0x79, 0x28, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, - 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x31, 0x2c, 0x20, 0x2d, 0x31, - 0x29, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, - 0x5f, 0x52, 0x41, 0x4e, 0x47, 0x45, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, - 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x5f, 0x52, 0x45, - 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x20, 0x3d, 0x3e, - 0x20, 0x47, 0x52, 0x41, 0x4d, 0x4d, 0x41, 0x52, 0x5f, 0x4c, 0x49, 0x54, - 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x45, 0x53, 0x43, 0x41, 0x50, 0x45, 0x53, - 0x5b, 0x6d, 0x5d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, - 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, - 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x6e, - 0x61, 0x6d, 0x65, 0x2e, 0x72, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x28, - 0x49, 0x4e, 0x56, 0x41, 0x4c, 0x49, 0x44, 0x5f, 0x52, 0x55, 0x4c, 0x45, - 0x5f, 0x43, 0x48, 0x41, 0x52, 0x53, 0x5f, 0x52, 0x45, 0x2c, 0x20, 0x27, - 0x2d, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x65, 0x73, 0x63, 0x4e, 0x61, - 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x69, 0x6e, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, - 0x5b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x5d, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, - 0x20, 0x6b, 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x69, 0x20, 0x3d, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x28, 0x60, 0x24, - 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, - 0x7d, 0x60, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x72, 0x75, 0x6c, 0x65, 0x73, 0x29, 0x20, 0x26, 0x26, 0x20, 0x28, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x5b, 0x60, - 0x24, 0x7b, 0x65, 0x73, 0x63, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, - 0x69, 0x7d, 0x60, 0x5d, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x72, 0x75, 0x6c, - 0x65, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b, 0x65, 0x73, 0x63, - 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x5b, 0x6b, - 0x65, 0x79, 0x5d, 0x20, 0x3d, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6b, - 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x61, - 0x73, 0x79, 0x6e, 0x63, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, - 0x52, 0x65, 0x66, 0x73, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, - 0x20, 0x75, 0x72, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x20, - 0x3d, 0x20, 0x61, 0x73, 0x79, 0x6e, 0x63, 0x20, 0x28, 0x6e, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, - 0x72, 0x72, 0x61, 0x79, 0x28, 0x6e, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x50, 0x72, 0x6f, 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x61, 0x6c, - 0x6c, 0x28, 0x6e, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x69, 0x73, 0x69, - 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x79, - 0x70, 0x65, 0x6f, 0x66, 0x20, 0x6e, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, - 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x20, 0x26, 0x26, 0x20, 0x6e, - 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x72, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x6e, 0x2e, 0x24, 0x72, 0x65, - 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, - 0x65, 0x66, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, - 0x69, 0x6e, 0x65, 0x64, 0x20, 0x26, 0x26, 0x20, 0x21, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x5b, 0x72, 0x65, 0x66, 0x5d, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x66, 0x2e, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x68, 0x74, - 0x74, 0x70, 0x73, 0x3a, 0x2f, 0x2f, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, - 0x6c, 0x6c, 0x6f, 0x77, 0x46, 0x65, 0x74, 0x63, 0x68, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x27, 0x46, 0x65, 0x74, 0x63, - 0x68, 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x6d, 0x6f, 0x74, 0x65, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x73, 0x20, 0x69, 0x73, 0x20, 0x6e, - 0x6f, 0x74, 0x20, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x65, 0x64, 0x20, 0x28, - 0x75, 0x73, 0x65, 0x20, 0x2d, 0x2d, 0x61, 0x6c, 0x6c, 0x6f, 0x77, 0x2d, - 0x66, 0x65, 0x74, 0x63, 0x68, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x66, 0x6f, - 0x72, 0x63, 0x65, 0x29, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x66, 0x65, 0x74, 0x63, 0x68, 0x20, 0x3d, 0x20, - 0x28, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x69, 0x6d, 0x70, 0x6f, 0x72, - 0x74, 0x28, 0x27, 0x6e, 0x6f, 0x64, 0x65, 0x2d, 0x66, 0x65, 0x74, 0x63, - 0x68, 0x27, 0x29, 0x29, 0x2e, 0x64, 0x65, 0x66, 0x61, 0x75, 0x6c, 0x74, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x66, 0x72, 0x61, - 0x67, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x66, - 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x23, 0x27, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x62, 0x61, 0x73, 0x65, 0x55, - 0x72, 0x6c, 0x20, 0x3d, 0x20, 0x66, 0x72, 0x61, 0x67, 0x53, 0x70, 0x6c, - 0x69, 0x74, 0x5b, 0x30, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x61, 0x72, 0x67, - 0x65, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, - 0x65, 0x66, 0x73, 0x5b, 0x62, 0x61, 0x73, 0x65, 0x55, 0x72, 0x6c, 0x5d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x74, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x20, 0x3d, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x52, 0x65, - 0x66, 0x73, 0x28, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x66, 0x65, 0x74, - 0x63, 0x68, 0x28, 0x72, 0x65, 0x66, 0x29, 0x2e, 0x74, 0x68, 0x65, 0x6e, - 0x28, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x65, 0x73, 0x2e, - 0x6a, 0x73, 0x6f, 0x6e, 0x28, 0x29, 0x29, 0x2c, 0x20, 0x62, 0x61, 0x73, - 0x65, 0x55, 0x72, 0x6c, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x5b, 0x62, 0x61, 0x73, 0x65, - 0x55, 0x72, 0x6c, 0x5d, 0x20, 0x3d, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, - 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x66, 0x72, - 0x61, 0x67, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x20, 0x7c, 0x7c, 0x20, - 0x66, 0x72, 0x61, 0x67, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x5b, 0x66, 0x72, - 0x61, 0x67, 0x53, 0x70, 0x6c, 0x69, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, - 0x27, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, - 0x66, 0x2e, 0x73, 0x74, 0x61, 0x72, 0x74, 0x73, 0x57, 0x69, 0x74, 0x68, - 0x28, 0x27, 0x23, 0x2f, 0x27, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x60, 0x24, 0x7b, - 0x75, 0x72, 0x6c, 0x7d, 0x24, 0x7b, 0x72, 0x65, 0x66, 0x7d, 0x60, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6e, 0x2e, 0x24, 0x72, 0x65, 0x66, 0x20, 0x3d, 0x20, 0x72, 0x65, - 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, - 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, - 0x72, 0x28, 0x60, 0x55, 0x6e, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, - 0x65, 0x64, 0x20, 0x72, 0x65, 0x66, 0x20, 0x24, 0x7b, 0x72, 0x65, 0x66, - 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, - 0x6c, 0x65, 0x63, 0x74, 0x6f, 0x72, 0x73, 0x20, 0x3d, 0x20, 0x72, 0x65, - 0x66, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x23, 0x27, 0x29, - 0x5b, 0x31, 0x5d, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x2f, - 0x27, 0x29, 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x31, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x65, - 0x6c, 0x20, 0x6f, 0x66, 0x20, 0x73, 0x65, 0x6c, 0x65, 0x63, 0x74, 0x6f, - 0x72, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x7c, 0x7c, 0x20, 0x21, 0x28, 0x73, - 0x65, 0x6c, 0x20, 0x69, 0x6e, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, - 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, - 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, - 0x45, 0x72, 0x72, 0x6f, 0x72, 0x20, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x69, 0x6e, 0x67, 0x20, 0x72, 0x65, 0x66, 0x20, 0x24, 0x7b, 0x72, 0x65, - 0x66, 0x7d, 0x3a, 0x20, 0x24, 0x7b, 0x73, 0x65, 0x6c, 0x7d, 0x20, 0x6e, - 0x6f, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, 0x4e, - 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, 0x74, - 0x61, 0x72, 0x67, 0x65, 0x74, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x20, 0x3d, 0x20, 0x74, 0x61, - 0x72, 0x67, 0x65, 0x74, 0x5b, 0x73, 0x65, 0x6c, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x5b, 0x72, 0x65, 0x66, - 0x5d, 0x20, 0x3d, 0x20, 0x74, 0x61, 0x72, 0x67, 0x65, 0x74, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x61, 0x77, 0x61, 0x69, 0x74, 0x20, 0x50, 0x72, 0x6f, - 0x6d, 0x69, 0x73, 0x65, 0x2e, 0x61, 0x6c, 0x6c, 0x28, 0x4f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x2e, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x73, 0x28, 0x6e, - 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x69, 0x73, 0x69, 0x74, 0x29, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6e, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x76, 0x69, 0x73, - 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x67, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x65, 0x55, 0x6e, 0x69, 0x6f, 0x6e, 0x52, 0x75, 0x6c, - 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x6c, 0x74, 0x53, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x61, 0x6c, 0x74, - 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x73, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x28, 0x61, 0x6c, 0x74, 0x53, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, - 0x61, 0x6c, 0x74, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, 0x60, - 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, 0x20, 0x27, 0x27, - 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x27, 0x2d, - 0x27, 0x20, 0x3a, 0x20, 0x27, 0x61, 0x6c, 0x74, 0x65, 0x72, 0x6e, 0x61, - 0x74, 0x69, 0x76, 0x65, 0x2d, 0x27, 0x7d, 0x24, 0x7b, 0x69, 0x7d, 0x60, - 0x29, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2e, 0x6a, 0x6f, - 0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x76, 0x69, 0x73, 0x69, 0x74, - 0x50, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x28, 0x70, 0x61, 0x74, 0x74, - 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2e, 0x73, 0x74, 0x61, 0x72, 0x74, 0x73, - 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x5e, 0x27, 0x29, 0x20, 0x7c, 0x7c, - 0x20, 0x21, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2e, 0x65, 0x6e, - 0x64, 0x73, 0x57, 0x69, 0x74, 0x68, 0x28, 0x27, 0x24, 0x27, 0x29, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x27, 0x50, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x6d, 0x75, - 0x73, 0x74, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x77, 0x69, 0x74, - 0x68, 0x20, 0x22, 0x5e, 0x22, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x65, 0x6e, - 0x64, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, 0x22, 0x24, 0x22, 0x27, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, 0x20, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, - 0x31, 0x2c, 0x20, 0x2d, 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x75, 0x62, 0x52, 0x75, 0x6c, - 0x65, 0x49, 0x64, 0x73, 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x20, 0x3d, 0x20, - 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3d, 0x20, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x67, 0x65, 0x74, 0x44, 0x6f, 0x74, 0x20, 0x3d, 0x20, 0x28, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x64, 0x6f, 0x74, 0x61, 0x6c, 0x6c, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, - 0x20, 0x3d, 0x20, 0x27, 0x5b, 0x5c, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, - 0x30, 0x30, 0x30, 0x30, 0x2d, 0x5c, 0x5c, 0x55, 0x30, 0x30, 0x31, 0x30, - 0x46, 0x46, 0x46, 0x46, 0x5d, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x41, 0x63, - 0x63, 0x65, 0x70, 0x74, 0x20, 0x61, 0x6e, 0x79, 0x20, 0x63, 0x68, 0x61, - 0x72, 0x61, 0x63, 0x74, 0x65, 0x72, 0x2e, 0x2e, 0x2e, 0x20, 0x65, 0x78, - 0x63, 0x65, 0x70, 0x74, 0x20, 0x5c, 0x6e, 0x20, 0x61, 0x6e, 0x64, 0x20, - 0x5c, 0x72, 0x20, 0x6c, 0x69, 0x6e, 0x65, 0x20, 0x62, 0x72, 0x65, 0x61, - 0x6b, 0x20, 0x63, 0x68, 0x61, 0x72, 0x73, 0x20, 0x28, 0x5c, 0x78, 0x30, - 0x41, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x5c, 0x78, 0x4f, 0x44, 0x29, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, - 0x20, 0x3d, 0x20, 0x27, 0x5b, 0x5c, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, - 0x30, 0x30, 0x30, 0x30, 0x2d, 0x5c, 0x5c, 0x78, 0x30, 0x39, 0x5c, 0x5c, - 0x78, 0x30, 0x42, 0x5c, 0x5c, 0x78, 0x30, 0x43, 0x5c, 0x5c, 0x78, 0x30, - 0x45, 0x2d, 0x5c, 0x5c, 0x55, 0x30, 0x30, 0x31, 0x30, 0x46, 0x46, 0x46, - 0x46, 0x5d, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, - 0x75, 0x6c, 0x65, 0x28, 0x27, 0x64, 0x6f, 0x74, 0x27, 0x2c, 0x20, 0x72, - 0x75, 0x6c, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, - 0x0a, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x74, 0x6f, 0x52, 0x75, 0x6c, 0x65, 0x20, 0x3d, 0x20, 0x28, 0x5b, - 0x73, 0x2c, 0x20, 0x69, 0x73, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, - 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x69, 0x73, 0x4c, 0x69, 0x74, 0x65, - 0x72, 0x61, 0x6c, 0x20, 0x3f, 0x20, 0x22, 0x5c, 0x22, 0x22, 0x20, 0x2b, - 0x20, 0x73, 0x20, 0x2b, 0x20, 0x22, 0x5c, 0x22, 0x22, 0x20, 0x3a, 0x20, - 0x73, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x6f, 0x72, 0x6d, 0x20, - 0x3d, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x20, 0x3d, 0x20, 0x69, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x46, 0x6f, 0x72, 0x20, 0x65, 0x61, - 0x63, 0x68, 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, - 0x20, 0x6f, 0x66, 0x20, 0x74, 0x68, 0x69, 0x73, 0x20, 0x73, 0x65, 0x71, - 0x75, 0x65, 0x6e, 0x63, 0x65, 0x2c, 0x20, 0x73, 0x74, 0x6f, 0x72, 0x65, - 0x20, 0x69, 0x74, 0x73, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, - 0x72, 0x65, 0x70, 0x72, 0x65, 0x73, 0x65, 0x6e, 0x74, 0x61, 0x74, 0x69, - 0x6f, 0x6e, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x77, 0x68, 0x65, 0x74, 0x68, - 0x65, 0x72, 0x20, 0x69, 0x74, 0x27, 0x73, 0x20, 0x61, 0x20, 0x6c, 0x69, - 0x74, 0x65, 0x72, 0x61, 0x6c, 0x2e, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x57, 0x65, 0x20, 0x6f, 0x6e, 0x6c, 0x79, 0x20, - 0x6e, 0x65, 0x65, 0x64, 0x20, 0x61, 0x20, 0x66, 0x6c, 0x61, 0x74, 0x20, - 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x75, 0x72, 0x65, 0x20, 0x68, 0x65, - 0x72, 0x65, 0x20, 0x74, 0x6f, 0x20, 0x61, 0x70, 0x70, 0x6c, 0x79, 0x20, - 0x72, 0x65, 0x70, 0x65, 0x74, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x6f, - 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x73, 0x20, 0x74, 0x6f, 0x20, - 0x74, 0x68, 0x65, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, - 0x6d, 0x2c, 0x20, 0x61, 0x6e, 0x64, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x2f, 0x2f, 0x20, 0x74, 0x6f, 0x20, 0x6d, 0x65, 0x72, 0x67, 0x65, - 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x73, 0x20, 0x61, 0x74, - 0x20, 0x74, 0x68, 0x65, 0x20, 0x61, 0x6e, 0x64, 0x20, 0x28, 0x77, 0x65, - 0x27, 0x72, 0x65, 0x20, 0x70, 0x61, 0x72, 0x73, 0x69, 0x6e, 0x67, 0x20, - 0x67, 0x72, 0x6f, 0x75, 0x70, 0x65, 0x64, 0x20, 0x28, 0x20, 0x73, 0x65, - 0x71, 0x75, 0x65, 0x6e, 0x63, 0x65, 0x73, 0x20, 0x29, 0x20, 0x72, 0x65, - 0x63, 0x75, 0x72, 0x73, 0x69, 0x76, 0x65, 0x6c, 0x79, 0x20, 0x61, 0x6e, - 0x64, 0x20, 0x64, 0x6f, 0x6e, 0x27, 0x74, 0x20, 0x74, 0x72, 0x65, 0x61, - 0x74, 0x20, 0x27, 0x7c, 0x27, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x61, - 0x6c, 0x6c, 0x79, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x2f, 0x2f, - 0x20, 0x28, 0x47, 0x42, 0x4e, 0x46, 0x27, 0x73, 0x20, 0x73, 0x79, 0x6e, - 0x74, 0x61, 0x78, 0x20, 0x69, 0x73, 0x20, 0x6c, 0x75, 0x63, 0x6b, 0x69, - 0x6c, 0x79, 0x20, 0x76, 0x65, 0x72, 0x79, 0x20, 0x63, 0x6c, 0x6f, 0x73, - 0x65, 0x20, 0x74, 0x6f, 0x20, 0x72, 0x65, 0x67, 0x75, 0x6c, 0x61, 0x72, - 0x20, 0x65, 0x78, 0x70, 0x72, 0x65, 0x73, 0x73, 0x69, 0x6f, 0x6e, 0x73, - 0x21, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x73, 0x65, 0x71, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, - 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6a, 0x6f, 0x69, 0x6e, 0x53, 0x65, 0x71, 0x20, 0x3d, 0x20, - 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, - 0x74, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x5b, 0x69, 0x73, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, - 0x6c, 0x2c, 0x20, 0x67, 0x5d, 0x20, 0x6f, 0x66, 0x20, 0x67, 0x72, 0x6f, - 0x75, 0x70, 0x42, 0x79, 0x28, 0x73, 0x65, 0x71, 0x2c, 0x20, 0x78, 0x20, - 0x3d, 0x3e, 0x20, 0x78, 0x5b, 0x31, 0x5d, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x69, 0x73, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, - 0x5b, 0x5b, 0x2e, 0x2e, 0x2e, 0x67, 0x5d, 0x2e, 0x6d, 0x61, 0x70, 0x28, - 0x78, 0x20, 0x3d, 0x3e, 0x20, 0x78, 0x5b, 0x30, 0x5d, 0x29, 0x2e, 0x6a, - 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x27, 0x29, 0x2c, 0x20, 0x74, 0x72, 0x75, - 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x2e, 0x2e, 0x2e, - 0x67, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x72, 0x65, 0x74, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x72, 0x65, 0x74, 0x5b, 0x30, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, - 0x5b, 0x72, 0x65, 0x74, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x78, 0x20, 0x3d, - 0x3e, 0x20, 0x74, 0x6f, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x78, 0x29, 0x29, - 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x20, 0x27, 0x29, 0x2c, 0x20, - 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x69, 0x20, 0x3c, 0x20, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x63, - 0x20, 0x3d, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, - 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x2e, 0x27, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x65, 0x71, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x5b, - 0x67, 0x65, 0x74, 0x44, 0x6f, 0x74, 0x28, 0x29, 0x2c, 0x20, 0x66, 0x61, - 0x6c, 0x73, 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, - 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x27, 0x28, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, - 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3c, 0x20, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, - 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x27, 0x3f, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, - 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, - 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, - 0x74, 0x65, 0x64, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, - 0x73, 0x79, 0x6e, 0x74, 0x61, 0x78, 0x20, 0x22, 0x24, 0x7b, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x7d, 0x22, 0x20, 0x61, - 0x74, 0x20, 0x69, 0x6e, 0x64, 0x65, 0x78, 0x20, 0x24, 0x7b, 0x69, 0x7d, - 0x20, 0x6f, 0x66, 0x20, 0x2f, 0x24, 0x7b, 0x70, 0x61, 0x74, 0x74, 0x65, - 0x72, 0x6e, 0x7d, 0x2f, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x2e, - 0x70, 0x75, 0x73, 0x68, 0x28, 0x5b, 0x60, 0x28, 0x24, 0x7b, 0x74, 0x6f, - 0x52, 0x75, 0x6c, 0x65, 0x28, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x6f, - 0x72, 0x6d, 0x28, 0x29, 0x29, 0x7d, 0x29, 0x60, 0x2c, 0x20, 0x66, 0x61, - 0x6c, 0x73, 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x29, 0x27, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, - 0x74, 0x61, 0x72, 0x74, 0x20, 0x3c, 0x3d, 0x20, 0x30, 0x20, 0x7c, 0x7c, - 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x73, 0x74, 0x61, - 0x72, 0x74, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x20, 0x21, 0x3d, 0x3d, 0x20, - 0x27, 0x28, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, - 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, - 0x55, 0x6e, 0x62, 0x61, 0x6c, 0x61, 0x6e, 0x63, 0x65, 0x64, 0x20, 0x70, - 0x61, 0x72, 0x65, 0x6e, 0x74, 0x68, 0x65, 0x73, 0x65, 0x73, 0x3b, 0x20, - 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, 0x3d, 0x20, 0x24, 0x7b, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x7d, 0x2c, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x24, 0x7b, - 0x69, 0x7d, 0x2c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, - 0x3d, 0x20, 0x24, 0x7b, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x7d, - 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6a, 0x6f, 0x69, - 0x6e, 0x53, 0x65, 0x71, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x5b, 0x27, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, 0x71, 0x75, 0x61, 0x72, 0x65, - 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x63, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, - 0x28, 0x69, 0x20, 0x3c, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, - 0x26, 0x26, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, - 0x5d, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x5d, 0x27, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, - 0x5b, 0x69, 0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x5c, 0x5c, 0x27, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x71, 0x75, 0x61, 0x72, 0x65, - 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x20, 0x2b, 0x3d, 0x20, - 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2e, 0x73, 0x6c, 0x69, 0x63, - 0x65, 0x28, 0x69, 0x2c, 0x20, 0x69, 0x20, 0x2b, 0x20, 0x32, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x32, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x71, - 0x75, 0x61, 0x72, 0x65, 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, - 0x20, 0x2b, 0x3d, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, - 0x69, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, 0x20, 0x3e, 0x3d, 0x20, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, - 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, - 0x28, 0x60, 0x55, 0x6e, 0x62, 0x61, 0x6c, 0x61, 0x6e, 0x63, 0x65, 0x64, - 0x20, 0x73, 0x71, 0x75, 0x61, 0x72, 0x65, 0x20, 0x62, 0x72, 0x61, 0x63, - 0x6b, 0x65, 0x74, 0x73, 0x3b, 0x20, 0x73, 0x74, 0x61, 0x72, 0x74, 0x20, - 0x3d, 0x20, 0x24, 0x7b, 0x73, 0x74, 0x61, 0x72, 0x74, 0x7d, 0x2c, 0x20, - 0x69, 0x20, 0x3d, 0x20, 0x24, 0x7b, 0x69, 0x7d, 0x2c, 0x20, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, 0x20, 0x24, 0x7b, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x71, 0x75, 0x61, - 0x72, 0x65, 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x20, 0x2b, - 0x3d, 0x20, 0x27, 0x5d, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, - 0x65, 0x71, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x5b, 0x73, 0x71, 0x75, - 0x61, 0x72, 0x65, 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x2c, - 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, - 0x7c, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x2e, 0x70, 0x75, 0x73, 0x68, - 0x28, 0x5b, 0x27, 0x7c, 0x27, 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, - 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, - 0x2a, 0x27, 0x20, 0x7c, 0x7c, 0x20, 0x63, 0x20, 0x3d, 0x3d, 0x3d, 0x20, - 0x27, 0x2b, 0x27, 0x20, 0x7c, 0x7c, 0x20, 0x63, 0x20, 0x3d, 0x3d, 0x3d, - 0x20, 0x27, 0x3f, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x5b, 0x73, 0x65, - 0x71, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, - 0x5d, 0x20, 0x3d, 0x20, 0x5b, 0x74, 0x6f, 0x52, 0x75, 0x6c, 0x65, 0x28, - 0x73, 0x65, 0x71, 0x5b, 0x73, 0x65, 0x71, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x29, 0x20, 0x2b, 0x20, 0x63, - 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, - 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x63, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x7b, 0x27, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, - 0x74, 0x20, 0x63, 0x75, 0x72, 0x6c, 0x79, 0x42, 0x72, 0x61, 0x63, 0x6b, - 0x65, 0x74, 0x73, 0x20, 0x3d, 0x20, 0x63, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, - 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x69, 0x20, 0x3c, 0x20, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x26, 0x26, 0x20, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x20, 0x21, 0x3d, 0x3d, - 0x20, 0x27, 0x7d, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x75, 0x72, 0x6c, - 0x79, 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x20, 0x2b, 0x3d, - 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x69, - 0x20, 0x3e, 0x3d, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, - 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, 0x62, 0x61, 0x6c, - 0x61, 0x6e, 0x63, 0x65, 0x64, 0x20, 0x63, 0x75, 0x72, 0x6c, 0x79, 0x20, - 0x62, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x3b, 0x20, 0x73, 0x74, - 0x61, 0x72, 0x74, 0x20, 0x3d, 0x20, 0x24, 0x7b, 0x73, 0x74, 0x61, 0x72, - 0x74, 0x7d, 0x2c, 0x20, 0x69, 0x20, 0x3d, 0x20, 0x24, 0x7b, 0x69, 0x7d, - 0x2c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x20, 0x3d, 0x20, - 0x24, 0x7b, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x7d, 0x60, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x75, 0x72, 0x6c, 0x79, 0x42, 0x72, 0x61, 0x63, 0x6b, 0x65, 0x74, - 0x73, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x7d, 0x27, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, - 0x20, 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6e, 0x75, 0x6d, 0x73, - 0x20, 0x3d, 0x20, 0x63, 0x75, 0x72, 0x6c, 0x79, 0x42, 0x72, 0x61, 0x63, - 0x6b, 0x65, 0x74, 0x73, 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x31, - 0x2c, 0x20, 0x2d, 0x31, 0x29, 0x2e, 0x73, 0x70, 0x6c, 0x69, 0x74, 0x28, - 0x27, 0x2c, 0x27, 0x29, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x73, 0x20, 0x3d, - 0x3e, 0x20, 0x73, 0x2e, 0x74, 0x72, 0x69, 0x6d, 0x28, 0x29, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, - 0x65, 0x74, 0x20, 0x6d, 0x69, 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x2c, - 0x20, 0x6d, 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x6e, 0x75, 0x6d, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, - 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x70, 0x61, 0x72, - 0x73, 0x65, 0x49, 0x6e, 0x74, 0x28, 0x6e, 0x75, 0x6d, 0x73, 0x5b, 0x30, - 0x5d, 0x2c, 0x20, 0x31, 0x30, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x61, 0x78, 0x54, - 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x6d, 0x69, 0x6e, 0x54, 0x69, - 0x6d, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x6e, 0x75, 0x6d, 0x73, 0x2e, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x32, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, 0x65, 0x77, - 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x49, 0x6e, 0x76, 0x61, - 0x6c, 0x69, 0x64, 0x20, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x66, 0x69, - 0x65, 0x72, 0x20, 0x24, 0x7b, 0x63, 0x75, 0x72, 0x6c, 0x79, 0x42, 0x72, - 0x61, 0x63, 0x6b, 0x65, 0x74, 0x73, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6d, 0x69, 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x20, - 0x6e, 0x75, 0x6d, 0x73, 0x5b, 0x30, 0x5d, 0x20, 0x3f, 0x20, 0x70, 0x61, - 0x72, 0x73, 0x65, 0x49, 0x6e, 0x74, 0x28, 0x6e, 0x75, 0x6d, 0x73, 0x5b, - 0x30, 0x5d, 0x2c, 0x20, 0x31, 0x30, 0x29, 0x20, 0x3a, 0x20, 0x30, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x6d, 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x20, - 0x6e, 0x75, 0x6d, 0x73, 0x5b, 0x31, 0x5d, 0x20, 0x3f, 0x20, 0x70, 0x61, - 0x72, 0x73, 0x65, 0x49, 0x6e, 0x74, 0x28, 0x6e, 0x75, 0x6d, 0x73, 0x5b, - 0x31, 0x5d, 0x2c, 0x20, 0x31, 0x30, 0x29, 0x20, 0x3a, 0x20, 0x49, 0x6e, - 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x5b, - 0x73, 0x75, 0x62, 0x2c, 0x20, 0x73, 0x75, 0x62, 0x49, 0x73, 0x4c, 0x69, - 0x74, 0x65, 0x72, 0x61, 0x6c, 0x5d, 0x20, 0x3d, 0x20, 0x73, 0x65, 0x71, - 0x5b, 0x73, 0x65, 0x71, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, - 0x2d, 0x20, 0x31, 0x5d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x69, 0x6e, - 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x30, 0x20, - 0x26, 0x26, 0x20, 0x6d, 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, - 0x3d, 0x3d, 0x3d, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x5b, 0x73, 0x65, 0x71, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x20, - 0x3d, 0x20, 0x5b, 0x60, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x7d, 0x2a, 0x60, - 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x69, 0x6e, 0x54, 0x69, - 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x30, 0x20, 0x26, 0x26, - 0x20, 0x6d, 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x5b, 0x73, - 0x65, 0x71, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, - 0x31, 0x5d, 0x20, 0x3d, 0x20, 0x5b, 0x60, 0x24, 0x7b, 0x73, 0x75, 0x62, - 0x7d, 0x3f, 0x60, 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x69, - 0x6e, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x31, - 0x20, 0x26, 0x26, 0x20, 0x6d, 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, - 0x79, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x5b, 0x73, 0x65, 0x71, - 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, - 0x20, 0x3d, 0x20, 0x5b, 0x60, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x7d, 0x2b, - 0x60, 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x5d, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, - 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x73, - 0x75, 0x62, 0x49, 0x73, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x69, 0x64, 0x20, - 0x3d, 0x20, 0x73, 0x75, 0x62, 0x52, 0x75, 0x6c, 0x65, 0x49, 0x64, 0x73, - 0x5b, 0x73, 0x75, 0x62, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x69, 0x64, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, - 0x66, 0x69, 0x6e, 0x65, 0x64, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x64, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x60, 0x24, 0x7b, 0x6e, - 0x61, 0x6d, 0x65, 0x7d, 0x2d, 0x24, 0x7b, 0x4f, 0x62, 0x6a, 0x65, 0x63, - 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x28, 0x73, 0x75, 0x62, 0x52, 0x75, - 0x6c, 0x65, 0x49, 0x64, 0x73, 0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, - 0x68, 0x20, 0x2b, 0x20, 0x31, 0x7d, 0x60, 0x2c, 0x20, 0x73, 0x75, 0x62, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x62, 0x52, 0x75, - 0x6c, 0x65, 0x49, 0x64, 0x73, 0x5b, 0x73, 0x75, 0x62, 0x5d, 0x20, 0x3d, - 0x20, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, - 0x75, 0x62, 0x20, 0x3d, 0x20, 0x69, 0x64, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, - 0x65, 0x64, 0x53, 0x75, 0x62, 0x20, 0x3d, 0x20, 0x41, 0x72, 0x72, 0x61, - 0x79, 0x2e, 0x66, 0x72, 0x6f, 0x6d, 0x28, 0x7b, 0x20, 0x6c, 0x65, 0x6e, - 0x67, 0x74, 0x68, 0x3a, 0x20, 0x6d, 0x69, 0x6e, 0x54, 0x69, 0x6d, 0x65, - 0x73, 0x20, 0x7d, 0x2c, 0x20, 0x28, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x73, - 0x75, 0x62, 0x49, 0x73, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x20, - 0x3f, 0x20, 0x60, 0x22, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x2e, 0x73, 0x6c, - 0x69, 0x63, 0x65, 0x28, 0x31, 0x2c, 0x20, 0x2d, 0x31, 0x29, 0x2e, 0x72, - 0x65, 0x70, 0x65, 0x61, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x54, 0x69, 0x6d, - 0x65, 0x73, 0x29, 0x7d, 0x22, 0x60, 0x20, 0x3a, 0x20, 0x73, 0x75, 0x62, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x70, 0x74, - 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x53, 0x75, 0x62, 0x20, 0x3d, 0x20, 0x6d, - 0x61, 0x78, 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x21, 0x3d, 0x3d, 0x20, - 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x3f, 0x20, - 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x66, 0x72, 0x6f, 0x6d, 0x28, 0x7b, - 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x3a, 0x20, 0x6d, 0x61, 0x78, - 0x54, 0x69, 0x6d, 0x65, 0x73, 0x20, 0x2d, 0x20, 0x6d, 0x69, 0x6e, 0x54, - 0x69, 0x6d, 0x65, 0x73, 0x20, 0x7d, 0x2c, 0x20, 0x28, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x60, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x7d, 0x3f, 0x60, 0x29, - 0x20, 0x3a, 0x20, 0x5b, 0x60, 0x24, 0x7b, 0x73, 0x75, 0x62, 0x7d, 0x2a, - 0x60, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x5b, 0x73, 0x65, 0x71, 0x2e, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x5d, 0x20, - 0x3d, 0x20, 0x5b, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x65, 0x64, 0x53, - 0x75, 0x62, 0x2e, 0x63, 0x6f, 0x6e, 0x63, 0x61, 0x74, 0x28, 0x6f, 0x70, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x53, 0x75, 0x62, 0x29, 0x2e, 0x6a, - 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x20, 0x27, 0x29, 0x2c, 0x20, 0x66, 0x61, - 0x6c, 0x73, 0x65, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, - 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x20, 0x3d, 0x20, 0x27, - 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x77, 0x68, 0x69, 0x6c, 0x65, 0x20, 0x28, 0x69, 0x20, 0x3c, 0x20, - 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x5c, 0x5c, 0x27, 0x20, 0x26, 0x26, - 0x20, 0x69, 0x20, 0x3c, 0x20, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, - 0x2d, 0x20, 0x31, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x6e, 0x65, 0x78, 0x74, 0x20, 0x3d, 0x20, 0x70, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x20, 0x2b, 0x20, 0x31, 0x5d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x45, 0x53, 0x43, 0x41, - 0x50, 0x45, 0x44, 0x5f, 0x49, 0x4e, 0x5f, 0x52, 0x45, 0x47, 0x45, 0x58, - 0x50, 0x53, 0x5f, 0x42, 0x55, 0x54, 0x5f, 0x4e, 0x4f, 0x54, 0x5f, 0x49, - 0x4e, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x53, 0x2e, 0x68, - 0x61, 0x73, 0x28, 0x6e, 0x65, 0x78, 0x74, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x20, - 0x2b, 0x3d, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, - 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, - 0x31, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, - 0x6c, 0x20, 0x2b, 0x3d, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, - 0x2e, 0x73, 0x6c, 0x69, 0x63, 0x65, 0x28, 0x69, 0x2c, 0x20, 0x69, 0x20, - 0x2b, 0x20, 0x32, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, - 0x2b, 0x3d, 0x20, 0x32, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, - 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x70, 0x61, 0x74, - 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x20, 0x3d, 0x3d, 0x3d, 0x20, - 0x27, 0x22, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x69, 0x74, - 0x65, 0x72, 0x61, 0x6c, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x5c, 0x5c, 0x22, - 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x21, 0x4e, 0x4f, 0x4e, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, - 0x5f, 0x53, 0x45, 0x54, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x70, 0x61, 0x74, - 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x29, 0x20, 0x26, 0x26, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x28, 0x69, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x6c, - 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x2d, 0x20, 0x31, 0x20, 0x7c, 0x7c, - 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x20, 0x3d, 0x3d, 0x3d, - 0x20, 0x27, 0x27, 0x20, 0x7c, 0x7c, 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, - 0x72, 0x6e, 0x5b, 0x69, 0x20, 0x2b, 0x20, 0x31, 0x5d, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x27, 0x2e, 0x27, 0x20, 0x7c, 0x7c, 0x20, 0x21, 0x4e, 0x4f, - 0x4e, 0x5f, 0x4c, 0x49, 0x54, 0x45, 0x52, 0x41, 0x4c, 0x5f, 0x53, 0x45, - 0x54, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, - 0x6e, 0x5b, 0x69, 0x2b, 0x31, 0x5d, 0x29, 0x29, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x20, 0x2b, 0x3d, - 0x20, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x5b, 0x69, 0x5d, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x20, 0x2b, 0x3d, 0x20, 0x31, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x62, 0x72, - 0x65, 0x61, 0x6b, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6c, 0x69, 0x74, - 0x65, 0x72, 0x61, 0x6c, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x27, 0x27, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x73, 0x65, 0x71, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, - 0x5b, 0x6c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x2c, 0x20, 0x74, 0x72, - 0x75, 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x6a, 0x6f, 0x69, 0x6e, 0x53, 0x65, 0x71, 0x28, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, - 0x6d, 0x65, 0x2c, 0x20, 0x22, 0x5c, 0x22, 0x5c, 0x5c, 0x5c, 0x22, 0x5c, - 0x22, 0x20, 0x22, 0x20, 0x2b, 0x20, 0x74, 0x6f, 0x52, 0x75, 0x6c, 0x65, - 0x28, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x66, 0x6f, 0x72, 0x6d, 0x28, 0x29, - 0x29, 0x20, 0x2b, 0x20, 0x22, 0x20, 0x5c, 0x22, 0x5c, 0x5c, 0x5c, 0x22, - 0x5c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x22, 0x29, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x72, 0x65, 0x73, 0x6f, 0x6c, - 0x76, 0x65, 0x52, 0x65, 0x66, 0x28, 0x72, 0x65, 0x66, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x72, 0x65, 0x66, - 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x72, 0x65, 0x66, 0x2e, 0x73, - 0x70, 0x6c, 0x69, 0x74, 0x28, 0x27, 0x2f, 0x27, 0x29, 0x2e, 0x70, 0x6f, - 0x70, 0x28, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x21, 0x28, 0x72, 0x65, 0x66, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x69, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, - 0x73, 0x29, 0x20, 0x26, 0x26, 0x20, 0x21, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x72, 0x65, 0x66, 0x73, 0x42, 0x65, 0x69, 0x6e, 0x67, 0x52, 0x65, - 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x64, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x72, - 0x65, 0x66, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x42, - 0x65, 0x69, 0x6e, 0x67, 0x52, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x64, - 0x2e, 0x61, 0x64, 0x64, 0x28, 0x72, 0x65, 0x66, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, - 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x5b, 0x72, 0x65, 0x66, - 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x66, - 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, - 0x65, 0x64, 0x2c, 0x20, 0x72, 0x65, 0x66, 0x4e, 0x61, 0x6d, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, 0x42, 0x65, 0x69, 0x6e, 0x67, 0x52, - 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x64, 0x2e, 0x64, 0x65, 0x6c, 0x65, - 0x74, 0x65, 0x28, 0x72, 0x65, 0x66, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x72, 0x65, 0x66, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, - 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x52, - 0x75, 0x6c, 0x65, 0x28, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x79, 0x70, - 0x65, 0x6f, 0x66, 0x20, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x20, 0x21, 0x3d, - 0x3d, 0x20, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, - 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, - 0x27, 0x4f, 0x6e, 0x6c, 0x79, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x73, 0x20, 0x61, - 0x72, 0x65, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, - 0x2c, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x27, 0x20, 0x2b, 0x20, 0x4a, 0x53, - 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, - 0x28, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, - 0x6d, 0x61, 0x74, 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x76, - 0x61, 0x6c, 0x75, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, - 0x20, 0x20, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x20, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x74, 0x79, 0x70, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x73, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, 0x3d, - 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x66, 0x6f, 0x72, 0x6d, - 0x61, 0x74, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x69, 0x6e, 0x20, 0x52, 0x45, 0x53, - 0x45, 0x52, 0x56, 0x45, 0x44, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, 0x20, - 0x3f, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x2b, 0x20, 0x27, 0x2d, 0x27, - 0x20, 0x3a, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x20, 0x27, - 0x27, 0x20, 0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3a, - 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x66, 0x20, 0x3d, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x24, 0x72, 0x65, 0x66, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x66, - 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, - 0x65, 0x64, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x72, 0x65, 0x73, 0x6f, 0x6c, 0x76, 0x65, 0x52, 0x65, 0x66, 0x28, - 0x72, 0x65, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x73, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f, 0x66, 0x20, 0x7c, - 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6e, 0x79, - 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x55, 0x6e, 0x69, - 0x6f, 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, - 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x6f, 0x6e, 0x65, 0x4f, - 0x66, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, - 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x41, 0x72, 0x72, 0x61, 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, - 0x79, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, - 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, - 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, 0x74, 0x65, 0x55, 0x6e, 0x69, 0x6f, - 0x6e, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x2e, 0x6d, - 0x61, 0x70, 0x28, 0x74, 0x20, 0x3d, 0x3e, 0x20, 0x28, 0x7b, 0x20, 0x74, - 0x79, 0x70, 0x65, 0x3a, 0x20, 0x74, 0x20, 0x7d, 0x29, 0x29, 0x29, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x27, - 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x74, 0x79, 0x70, 0x65, 0x6f, 0x66, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x21, 0x3d, 0x3d, 0x20, - 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, - 0x77, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, - 0x27, 0x4f, 0x6e, 0x6c, 0x79, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x73, 0x20, 0x61, - 0x72, 0x65, 0x20, 0x73, 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x65, 0x64, - 0x2c, 0x20, 0x67, 0x6f, 0x74, 0x20, 0x27, 0x20, 0x2b, 0x20, 0x4a, 0x53, - 0x4f, 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, - 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, - 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, - 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, - 0x2c, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x67, 0x65, 0x6e, 0x65, - 0x72, 0x61, 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, - 0x52, 0x75, 0x6c, 0x65, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x27, 0x65, 0x6e, 0x75, 0x6d, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, - 0x20, 0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x65, 0x6e, - 0x75, 0x6d, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x76, 0x20, 0x3d, 0x3e, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x67, 0x65, 0x6e, 0x65, 0x72, 0x61, - 0x74, 0x65, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x52, 0x75, - 0x6c, 0x65, 0x28, 0x76, 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, - 0x27, 0x20, 0x7c, 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, - 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, - 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x28, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x75, - 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x7c, 0x7c, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x29, - 0x20, 0x26, 0x26, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x28, 0x27, 0x70, 0x72, 0x6f, - 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x27, 0x20, 0x69, 0x6e, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, 0x7c, 0x7c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x28, 0x27, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, - 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, - 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, - 0x26, 0x26, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x61, 0x64, - 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, - 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x74, - 0x72, 0x75, 0x65, 0x29, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, - 0x75, 0x69, 0x72, 0x65, 0x64, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, - 0x53, 0x65, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x72, - 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x20, 0x7c, 0x7c, 0x20, 0x5b, - 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x20, 0x3d, 0x20, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, - 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65, 0x73, 0x28, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, - 0x73, 0x20, 0x3f, 0x3f, 0x20, 0x7b, 0x7d, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, - 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x4f, 0x62, - 0x6a, 0x65, 0x63, 0x74, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x70, 0x72, 0x6f, - 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x2c, 0x20, 0x72, 0x65, 0x71, - 0x75, 0x69, 0x72, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, - 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x61, 0x64, 0x64, 0x69, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x69, 0x65, 0x73, 0x29, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x28, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, - 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, - 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, - 0x63, 0x74, 0x27, 0x29, 0x20, 0x26, 0x26, 0x20, 0x27, 0x61, 0x6c, 0x6c, - 0x4f, 0x66, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, - 0x64, 0x20, 0x3d, 0x20, 0x6e, 0x65, 0x77, 0x20, 0x53, 0x65, 0x74, 0x28, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, - 0x73, 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x61, 0x64, 0x64, 0x43, - 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x20, 0x3d, 0x20, 0x28, - 0x63, 0x6f, 0x6d, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2c, 0x20, - 0x69, 0x73, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x66, 0x20, 0x3d, - 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, - 0x24, 0x72, 0x65, 0x66, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x66, 0x20, 0x21, 0x3d, - 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6d, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, - 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x72, 0x65, 0x66, 0x73, - 0x5b, 0x72, 0x65, 0x66, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x70, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x69, 0x65, 0x73, 0x27, 0x20, 0x69, 0x6e, 0x20, 0x63, 0x6f, - 0x6d, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, - 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5b, 0x70, 0x72, - 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, - 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x5d, 0x20, 0x6f, 0x66, 0x20, 0x4f, - 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65, - 0x73, 0x28, 0x63, 0x6f, 0x6d, 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x2e, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x5b, 0x70, 0x72, 0x6f, - 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x53, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, - 0x28, 0x69, 0x73, 0x52, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, - 0x64, 0x2e, 0x61, 0x64, 0x64, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, - 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, - 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x20, 0x6f, 0x66, - 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x61, 0x6c, 0x6c, 0x4f, - 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x27, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x27, - 0x20, 0x69, 0x6e, 0x20, 0x74, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x74, 0x74, 0x20, 0x6f, 0x66, 0x20, - 0x74, 0x2e, 0x61, 0x6e, 0x79, 0x4f, 0x66, 0x29, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x61, - 0x64, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, 0x65, 0x6e, 0x74, 0x28, - 0x74, 0x74, 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, - 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x61, 0x64, 0x64, 0x43, 0x6f, 0x6d, 0x70, 0x6f, 0x6e, - 0x65, 0x6e, 0x74, 0x28, 0x74, 0x2c, 0x20, 0x74, 0x72, 0x75, 0x65, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, - 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x5f, 0x62, 0x75, 0x69, 0x6c, 0x64, 0x4f, 0x62, 0x6a, - 0x65, 0x63, 0x74, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x2c, 0x20, 0x72, 0x65, 0x71, 0x75, - 0x69, 0x72, 0x65, 0x64, 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, - 0x2f, 0x2a, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, - 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x3d, - 0x20, 0x2a, 0x2f, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, 0x29, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x69, 0x66, 0x20, 0x28, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, - 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, - 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, - 0x27, 0x61, 0x72, 0x72, 0x61, 0x79, 0x27, 0x29, 0x20, 0x26, 0x26, 0x20, - 0x28, 0x27, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20, 0x69, 0x6e, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x70, - 0x72, 0x65, 0x66, 0x69, 0x78, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x27, 0x20, - 0x69, 0x6e, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x63, - 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3f, - 0x3f, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x70, 0x72, 0x65, - 0x66, 0x69, 0x78, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x41, 0x72, 0x72, 0x61, - 0x79, 0x2e, 0x69, 0x73, 0x41, 0x72, 0x72, 0x61, 0x79, 0x28, 0x69, 0x74, - 0x65, 0x6d, 0x73, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, - 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, 0x22, 0x5b, 0x22, - 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x27, 0x20, 0x2b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, - 0x74, 0x65, 0x6d, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x28, 0x69, 0x74, - 0x65, 0x6d, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x74, 0x68, - 0x69, 0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x69, 0x74, 0x65, - 0x6d, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, - 0x3f, 0x20, 0x27, 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, - 0x3f, 0x20, 0x27, 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x74, - 0x75, 0x70, 0x6c, 0x65, 0x2d, 0x24, 0x7b, 0x69, 0x7d, 0x60, 0x29, 0x29, - 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x20, 0x22, 0x2c, 0x22, 0x20, - 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x27, 0x29, 0x20, 0x2b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x27, - 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x27, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, - 0x6e, 0x73, 0x74, 0x20, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c, 0x65, - 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x69, 0x74, 0x65, 0x6d, 0x73, 0x2c, - 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, 0x20, - 0x27, 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, - 0x27, 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x69, 0x74, 0x65, - 0x6d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6c, 0x69, 0x73, 0x74, 0x49, - 0x74, 0x65, 0x6d, 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x20, - 0x3d, 0x20, 0x60, 0x28, 0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, - 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, 0x52, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x29, 0x60, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x73, - 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x69, 0x76, 0x65, 0x49, 0x74, 0x65, - 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6d, 0x69, 0x6e, - 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x2e, 0x6d, 0x69, 0x6e, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, - 0x7c, 0x7c, 0x20, 0x30, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6d, 0x61, 0x78, 0x49, - 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x2e, 0x6d, 0x61, 0x78, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x6d, 0x69, 0x6e, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3e, 0x20, 0x30, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x69, 0x76, 0x65, - 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3d, 0x20, 0x6c, 0x69, 0x73, 0x74, - 0x49, 0x74, 0x65, 0x6d, 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, - 0x2e, 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x28, 0x6d, 0x69, 0x6e, 0x49, - 0x74, 0x65, 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x31, 0x29, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x6d, 0x69, 0x6e, - 0x49, 0x74, 0x65, 0x6d, 0x73, 0x2d, 0x2d, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6d, 0x61, 0x78, 0x49, 0x74, - 0x65, 0x6d, 0x73, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, - 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x26, 0x26, 0x20, 0x6d, 0x61, 0x78, - 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3e, 0x20, 0x6d, 0x69, 0x6e, 0x49, - 0x74, 0x65, 0x6d, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, - 0x73, 0x69, 0x76, 0x65, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x2b, 0x3d, - 0x20, 0x60, 0x24, 0x7b, 0x6c, 0x69, 0x73, 0x74, 0x49, 0x74, 0x65, 0x6d, - 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, 0x6f, 0x72, 0x7d, 0x3f, 0x60, 0x2e, - 0x72, 0x65, 0x70, 0x65, 0x61, 0x74, 0x28, 0x6d, 0x61, 0x78, 0x49, 0x74, - 0x65, 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x6d, 0x69, 0x6e, 0x49, 0x74, 0x65, - 0x6d, 0x73, 0x20, 0x2d, 0x20, 0x31, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x69, 0x76, 0x65, 0x49, 0x74, - 0x65, 0x6d, 0x73, 0x20, 0x2b, 0x3d, 0x20, 0x60, 0x24, 0x7b, 0x6c, 0x69, - 0x73, 0x74, 0x49, 0x74, 0x65, 0x6d, 0x4f, 0x70, 0x65, 0x72, 0x61, 0x74, - 0x6f, 0x72, 0x7d, 0x2a, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, - 0x3d, 0x20, 0x6d, 0x69, 0x6e, 0x49, 0x74, 0x65, 0x6d, 0x73, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x30, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x3f, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, 0x73, 0x70, - 0x61, 0x63, 0x65, 0x20, 0x28, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, - 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x24, 0x7b, - 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x69, 0x76, 0x65, 0x49, 0x74, - 0x65, 0x6d, 0x73, 0x7d, 0x20, 0x29, 0x3f, 0x20, 0x22, 0x5d, 0x22, 0x20, - 0x73, 0x70, 0x61, 0x63, 0x65, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x3a, 0x20, 0x60, 0x22, 0x5b, 0x22, 0x20, - 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, 0x7b, 0x69, 0x74, 0x65, 0x6d, - 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x24, 0x7b, - 0x73, 0x75, 0x63, 0x63, 0x65, 0x73, 0x73, 0x69, 0x76, 0x65, 0x49, 0x74, - 0x65, 0x6d, 0x73, 0x7d, 0x20, 0x22, 0x5d, 0x22, 0x20, 0x73, 0x70, 0x61, - 0x63, 0x65, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, - 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, - 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, 0x65, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, - 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, - 0x6e, 0x65, 0x64, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x73, - 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x20, 0x26, 0x26, 0x20, 0x27, - 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x27, 0x20, 0x69, 0x6e, 0x20, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x76, 0x69, 0x73, 0x69, 0x74, 0x50, 0x61, - 0x74, 0x74, 0x65, 0x72, 0x6e, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, - 0x2e, 0x70, 0x61, 0x74, 0x74, 0x65, 0x72, 0x6e, 0x2c, 0x20, 0x72, 0x75, - 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, - 0x3d, 0x3d, 0x3d, 0x20, 0x75, 0x6e, 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, - 0x64, 0x20, 0x7c, 0x7c, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, - 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x73, 0x74, 0x72, - 0x69, 0x6e, 0x67, 0x27, 0x29, 0x20, 0x26, 0x26, 0x20, 0x2f, 0x5e, 0x75, - 0x75, 0x69, 0x64, 0x5b, 0x31, 0x2d, 0x35, 0x5d, 0x3f, 0x24, 0x2f, 0x2e, - 0x74, 0x65, 0x73, 0x74, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, - 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, 0x7c, 0x7c, 0x20, 0x27, 0x27, - 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x4e, - 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, 0x6f, 0x6f, - 0x74, 0x27, 0x20, 0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, 0x27, 0x20, - 0x3a, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x46, 0x6f, 0x72, 0x6d, - 0x61, 0x74, 0x2c, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, - 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x27, 0x75, 0x75, 0x69, 0x64, 0x27, - 0x5d, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, - 0x65, 0x20, 0x69, 0x66, 0x20, 0x28, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, - 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x75, 0x6e, - 0x64, 0x65, 0x66, 0x69, 0x6e, 0x65, 0x64, 0x20, 0x7c, 0x7c, 0x20, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x3d, 0x3d, - 0x3d, 0x20, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x29, 0x20, - 0x26, 0x26, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x2e, 0x66, 0x6f, - 0x72, 0x6d, 0x61, 0x74, 0x20, 0x69, 0x6e, 0x20, 0x44, 0x41, 0x54, 0x45, - 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, - 0x73, 0x74, 0x20, 0x5b, 0x74, 0x2c, 0x20, 0x72, 0x5d, 0x20, 0x6f, 0x66, - 0x20, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x2e, 0x65, 0x6e, 0x74, 0x72, - 0x69, 0x65, 0x73, 0x28, 0x44, 0x41, 0x54, 0x45, 0x5f, 0x52, 0x55, 0x4c, - 0x45, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, - 0x52, 0x75, 0x6c, 0x65, 0x28, 0x74, 0x2c, 0x20, 0x72, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x73, 0x63, 0x68, - 0x65, 0x6d, 0x61, 0x46, 0x6f, 0x72, 0x6d, 0x61, 0x74, 0x20, 0x2b, 0x20, - 0x27, 0x2d, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x28, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, - 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, - 0x74, 0x27, 0x29, 0x20, 0x7c, 0x7c, 0x20, 0x28, 0x4f, 0x62, 0x6a, 0x65, - 0x63, 0x74, 0x2e, 0x6b, 0x65, 0x79, 0x73, 0x28, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x29, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x30, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x6e, 0x20, 0x6f, 0x66, 0x20, 0x4f, 0x42, 0x4a, 0x45, 0x43, - 0x54, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x5f, 0x4e, 0x41, 0x4d, 0x45, 0x53, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, - 0x65, 0x28, 0x6e, 0x2c, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, - 0x56, 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x6e, 0x5d, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, - 0x28, 0x72, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x27, - 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x21, 0x28, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x20, 0x69, 0x6e, - 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, - 0x55, 0x4c, 0x45, 0x53, 0x29, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x74, 0x68, 0x72, 0x6f, 0x77, 0x20, 0x6e, - 0x65, 0x77, 0x20, 0x45, 0x72, 0x72, 0x6f, 0x72, 0x28, 0x60, 0x55, 0x6e, - 0x72, 0x65, 0x63, 0x6f, 0x67, 0x6e, 0x69, 0x7a, 0x65, 0x64, 0x20, 0x73, - 0x63, 0x68, 0x65, 0x6d, 0x61, 0x3a, 0x20, 0x24, 0x7b, 0x4a, 0x53, 0x4f, - 0x4e, 0x2e, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x69, 0x66, 0x79, 0x28, - 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x29, 0x7d, 0x60, 0x29, 0x3b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x54, 0x4f, 0x44, 0x4f, 0x3a, 0x20, 0x73, - 0x75, 0x70, 0x70, 0x6f, 0x72, 0x74, 0x20, 0x6d, 0x69, 0x6e, 0x69, 0x6d, - 0x75, 0x6d, 0x2c, 0x20, 0x6d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x2c, - 0x20, 0x65, 0x78, 0x63, 0x6c, 0x75, 0x73, 0x69, 0x76, 0x65, 0x4d, 0x69, - 0x6e, 0x69, 0x6d, 0x75, 0x6d, 0x2c, 0x20, 0x65, 0x78, 0x63, 0x6c, 0x75, - 0x73, 0x69, 0x76, 0x65, 0x4d, 0x61, 0x78, 0x69, 0x6d, 0x75, 0x6d, 0x20, - 0x61, 0x74, 0x20, 0x6c, 0x65, 0x61, 0x73, 0x74, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x7a, 0x65, 0x72, 0x6f, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, - 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x72, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x72, - 0x6f, 0x6f, 0x74, 0x27, 0x20, 0x3f, 0x20, 0x27, 0x72, 0x6f, 0x6f, 0x74, - 0x27, 0x20, 0x3a, 0x20, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x54, 0x79, - 0x70, 0x65, 0x2c, 0x20, 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, - 0x45, 0x5f, 0x52, 0x55, 0x4c, 0x45, 0x53, 0x5b, 0x73, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x54, 0x79, 0x70, 0x65, 0x5d, 0x29, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x5f, - 0x62, 0x75, 0x69, 0x6c, 0x64, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, 0x52, - 0x75, 0x6c, 0x65, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, - 0x65, 0x73, 0x2c, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, - 0x2c, 0x20, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x61, 0x64, 0x64, 0x69, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x69, 0x65, 0x73, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, - 0x64, 0x65, 0x72, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, - 0x70, 0x72, 0x6f, 0x70, 0x4f, 0x72, 0x64, 0x65, 0x72, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x2f, 0x2f, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x20, 0x62, - 0x79, 0x20, 0x70, 0x6f, 0x73, 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x20, 0x69, - 0x6e, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x5f, 0x6f, 0x72, 0x64, 0x65, 0x72, - 0x20, 0x28, 0x69, 0x66, 0x20, 0x73, 0x70, 0x65, 0x63, 0x69, 0x66, 0x69, - 0x65, 0x64, 0x29, 0x20, 0x74, 0x68, 0x65, 0x6e, 0x20, 0x62, 0x79, 0x20, - 0x6f, 0x72, 0x69, 0x67, 0x69, 0x6e, 0x61, 0x6c, 0x20, 0x6f, 0x72, 0x64, - 0x65, 0x72, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x70, 0x73, - 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, - 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x28, 0x5b, 0x6b, 0x5d, 0x29, 0x20, - 0x3d, 0x3e, 0x20, 0x6b, 0x29, 0x2e, 0x73, 0x6f, 0x72, 0x74, 0x28, 0x28, - 0x61, 0x2c, 0x20, 0x62, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, - 0x72, 0x64, 0x65, 0x72, 0x41, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, - 0x4f, 0x72, 0x64, 0x65, 0x72, 0x5b, 0x61, 0x5d, 0x20, 0x7c, 0x7c, 0x20, - 0x49, 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x42, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x4f, - 0x72, 0x64, 0x65, 0x72, 0x5b, 0x62, 0x5d, 0x20, 0x7c, 0x7c, 0x20, 0x49, - 0x6e, 0x66, 0x69, 0x6e, 0x69, 0x74, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x6f, 0x72, - 0x64, 0x65, 0x72, 0x41, 0x20, 0x2d, 0x20, 0x6f, 0x72, 0x64, 0x65, 0x72, - 0x42, 0x20, 0x7c, 0x7c, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, - 0x69, 0x65, 0x73, 0x2e, 0x66, 0x69, 0x6e, 0x64, 0x49, 0x6e, 0x64, 0x65, - 0x78, 0x28, 0x28, 0x5b, 0x6b, 0x5d, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x6b, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x61, 0x29, 0x20, 0x2d, 0x20, 0x70, 0x72, - 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x2e, 0x66, 0x69, 0x6e, - 0x64, 0x49, 0x6e, 0x64, 0x65, 0x78, 0x28, 0x28, 0x5b, 0x6b, 0x5d, 0x29, - 0x20, 0x3d, 0x3e, 0x20, 0x6b, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x62, 0x29, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x29, 0x3b, 0x0a, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x70, 0x72, 0x6f, - 0x70, 0x4b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x73, - 0x20, 0x3d, 0x20, 0x7b, 0x7d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x66, - 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5b, 0x70, - 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x70, 0x72, 0x6f, - 0x70, 0x53, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x5d, 0x20, 0x6f, 0x66, 0x20, - 0x70, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, - 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x76, 0x69, - 0x73, 0x69, 0x74, 0x28, 0x70, 0x72, 0x6f, 0x70, 0x53, 0x63, 0x68, 0x65, - 0x6d, 0x61, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, - 0x3f, 0x3f, 0x20, 0x27, 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, - 0x20, 0x3f, 0x20, 0x27, 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, - 0x24, 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x60, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, 0x6f, - 0x70, 0x4b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x73, - 0x5b, 0x70, 0x72, 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x5d, 0x20, 0x3d, - 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, - 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, 0x20, 0x27, - 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x27, - 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x24, 0x7b, 0x70, 0x72, - 0x6f, 0x70, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x2d, 0x6b, 0x76, 0x60, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x24, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x66, 0x6f, 0x72, 0x6d, 0x61, 0x74, - 0x4c, 0x69, 0x74, 0x65, 0x72, 0x61, 0x6c, 0x28, 0x70, 0x72, 0x6f, 0x70, - 0x4e, 0x61, 0x6d, 0x65, 0x29, 0x7d, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, - 0x20, 0x22, 0x3a, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x24, - 0x7b, 0x70, 0x72, 0x6f, 0x70, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, - 0x65, 0x7d, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, - 0x64, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x6f, 0x72, - 0x74, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x66, 0x69, 0x6c, - 0x74, 0x65, 0x72, 0x28, 0x6b, 0x20, 0x3d, 0x3e, 0x20, 0x72, 0x65, 0x71, - 0x75, 0x69, 0x72, 0x65, 0x64, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x6b, 0x29, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, - 0x20, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, - 0x70, 0x73, 0x20, 0x3d, 0x20, 0x73, 0x6f, 0x72, 0x74, 0x65, 0x64, 0x50, - 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x66, 0x69, 0x6c, 0x74, 0x65, 0x72, 0x28, - 0x6b, 0x20, 0x3d, 0x3e, 0x20, 0x21, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, - 0x65, 0x64, 0x2e, 0x68, 0x61, 0x73, 0x28, 0x6b, 0x29, 0x29, 0x3b, 0x0a, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x74, 0x79, 0x70, - 0x65, 0x6f, 0x66, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, 0x6e, - 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, 0x73, - 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x27, 0x6f, 0x62, 0x6a, 0x65, 0x63, 0x74, - 0x27, 0x20, 0x7c, 0x7c, 0x20, 0x61, 0x64, 0x64, 0x69, 0x74, 0x69, 0x6f, - 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, 0x74, 0x69, 0x65, - 0x73, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x74, 0x72, 0x75, 0x65, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x73, 0x75, 0x62, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, - 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, 0x20, 0x27, - 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x27, - 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x61, 0x64, 0x64, 0x69, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x60, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x76, 0x61, 0x6c, - 0x75, 0x65, 0x52, 0x75, 0x6c, 0x65, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x76, 0x69, 0x73, 0x69, 0x74, 0x28, 0x61, 0x64, 0x64, 0x69, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, 0x72, - 0x74, 0x69, 0x65, 0x73, 0x20, 0x3d, 0x3d, 0x3d, 0x20, 0x74, 0x72, 0x75, - 0x65, 0x20, 0x3f, 0x20, 0x7b, 0x7d, 0x20, 0x3a, 0x20, 0x61, 0x64, 0x64, - 0x69, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x65, - 0x72, 0x74, 0x69, 0x65, 0x73, 0x2c, 0x20, 0x60, 0x24, 0x7b, 0x73, 0x75, - 0x62, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x2d, 0x76, 0x61, 0x6c, 0x75, 0x65, - 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x70, 0x72, - 0x6f, 0x70, 0x4b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, - 0x73, 0x5b, 0x27, 0x2a, 0x27, 0x5d, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x24, 0x7b, 0x73, - 0x75, 0x62, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x2d, 0x6b, 0x76, 0x60, 0x2c, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, 0x24, 0x7b, - 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, - 0x65, 0x28, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, 0x2c, 0x20, - 0x50, 0x52, 0x49, 0x4d, 0x49, 0x54, 0x49, 0x56, 0x45, 0x5f, 0x52, 0x55, - 0x4c, 0x45, 0x53, 0x5b, 0x27, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x27, - 0x5d, 0x29, 0x7d, 0x20, 0x22, 0x3a, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, - 0x65, 0x20, 0x24, 0x7b, 0x76, 0x61, 0x6c, 0x75, 0x65, 0x52, 0x75, 0x6c, - 0x65, 0x7d, 0x60, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, - 0x73, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x27, 0x2a, 0x27, 0x29, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x65, 0x74, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x3d, 0x20, 0x27, - 0x22, 0x7b, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x27, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, - 0x20, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x50, 0x72, 0x6f, - 0x70, 0x73, 0x2e, 0x6d, 0x61, 0x70, 0x28, 0x6b, 0x20, 0x3d, 0x3e, 0x20, - 0x70, 0x72, 0x6f, 0x70, 0x4b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, - 0x6d, 0x65, 0x73, 0x5b, 0x6b, 0x5d, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, - 0x28, 0x27, 0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, - 0x20, 0x27, 0x29, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x6f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, - 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x20, 0x28, 0x27, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, - 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x50, 0x72, 0x6f, 0x70, - 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x20, 0x22, 0x2c, - 0x22, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x28, 0x20, 0x27, 0x3b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x67, 0x65, - 0x74, 0x52, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, 0x76, 0x65, 0x52, 0x65, - 0x66, 0x73, 0x20, 0x3d, 0x20, 0x28, 0x6b, 0x73, 0x2c, 0x20, 0x66, 0x69, - 0x72, 0x73, 0x74, 0x49, 0x73, 0x4f, 0x70, 0x74, 0x69, 0x6f, 0x6e, 0x61, - 0x6c, 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x5b, 0x6b, - 0x2c, 0x20, 0x2e, 0x2e, 0x2e, 0x72, 0x65, 0x73, 0x74, 0x5d, 0x20, 0x3d, - 0x20, 0x6b, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6b, 0x76, 0x52, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x3d, 0x20, 0x70, 0x72, 0x6f, 0x70, - 0x4b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x73, 0x5b, - 0x6b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x6c, 0x65, 0x74, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6b, 0x20, 0x3d, - 0x3d, 0x3d, 0x20, 0x27, 0x2a, 0x27, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x73, 0x20, 0x3d, 0x20, 0x74, 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, - 0x64, 0x52, 0x75, 0x6c, 0x65, 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x60, - 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, 0x20, 0x27, 0x27, - 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x20, 0x27, 0x2d, - 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x61, 0x64, 0x64, 0x69, 0x74, - 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x2d, 0x6b, 0x76, 0x73, 0x60, 0x2c, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x60, 0x24, 0x7b, 0x6b, 0x76, 0x52, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x28, 0x20, 0x22, 0x2c, 0x22, - 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x20, 0x60, 0x20, 0x2b, 0x20, 0x6b, - 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x20, 0x2b, 0x20, - 0x60, 0x20, 0x29, 0x2a, 0x60, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, 0x20, 0x69, - 0x66, 0x20, 0x28, 0x66, 0x69, 0x72, 0x73, 0x74, 0x49, 0x73, 0x4f, 0x70, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x73, 0x20, - 0x3d, 0x20, 0x60, 0x28, 0x20, 0x22, 0x2c, 0x22, 0x20, 0x73, 0x70, 0x61, - 0x63, 0x65, 0x20, 0x24, 0x7b, 0x6b, 0x76, 0x52, 0x75, 0x6c, 0x65, 0x4e, - 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x29, 0x3f, 0x60, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x20, 0x65, 0x6c, 0x73, 0x65, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x72, 0x65, 0x73, 0x20, 0x3d, 0x20, 0x6b, 0x76, 0x52, 0x75, 0x6c, - 0x65, 0x4e, 0x61, 0x6d, 0x65, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x69, 0x66, 0x20, 0x28, 0x72, 0x65, 0x73, 0x74, 0x2e, 0x6c, 0x65, - 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x73, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x20, 0x27, 0x20, 0x2b, 0x20, 0x74, - 0x68, 0x69, 0x73, 0x2e, 0x5f, 0x61, 0x64, 0x64, 0x52, 0x75, 0x6c, 0x65, - 0x28, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x60, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, 0x3f, - 0x20, 0x27, 0x27, 0x7d, 0x24, 0x7b, 0x6e, 0x61, 0x6d, 0x65, 0x20, 0x3f, - 0x20, 0x27, 0x2d, 0x27, 0x20, 0x3a, 0x20, 0x27, 0x27, 0x7d, 0x24, 0x7b, - 0x6b, 0x7d, 0x2d, 0x72, 0x65, 0x73, 0x74, 0x60, 0x2c, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x65, - 0x74, 0x52, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, 0x76, 0x65, 0x52, 0x65, - 0x66, 0x73, 0x28, 0x72, 0x65, 0x73, 0x74, 0x2c, 0x20, 0x74, 0x72, 0x75, - 0x65, 0x29, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, - 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x65, 0x73, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x20, 0x20, 0x7d, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x6f, 0x70, - 0x74, 0x69, 0x6f, 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x2e, - 0x6d, 0x61, 0x70, 0x28, 0x28, 0x5f, 0x2c, 0x20, 0x69, 0x29, 0x20, 0x3d, - 0x3e, 0x20, 0x67, 0x65, 0x74, 0x52, 0x65, 0x63, 0x75, 0x72, 0x73, 0x69, - 0x76, 0x65, 0x52, 0x65, 0x66, 0x73, 0x28, 0x6f, 0x70, 0x74, 0x69, 0x6f, - 0x6e, 0x61, 0x6c, 0x50, 0x72, 0x6f, 0x70, 0x73, 0x2e, 0x73, 0x6c, 0x69, - 0x63, 0x65, 0x28, 0x69, 0x29, 0x2c, 0x20, 0x66, 0x61, 0x6c, 0x73, 0x65, - 0x29, 0x29, 0x2e, 0x6a, 0x6f, 0x69, 0x6e, 0x28, 0x27, 0x20, 0x7c, 0x20, - 0x27, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x72, 0x65, 0x71, 0x75, 0x69, 0x72, 0x65, 0x64, 0x50, 0x72, - 0x6f, 0x70, 0x73, 0x2e, 0x6c, 0x65, 0x6e, 0x67, 0x74, 0x68, 0x20, 0x3e, - 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x20, - 0x29, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, - 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x20, 0x2b, - 0x3d, 0x20, 0x27, 0x20, 0x29, 0x3f, 0x27, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x75, 0x6c, 0x65, - 0x20, 0x2b, 0x3d, 0x20, 0x27, 0x20, 0x22, 0x7d, 0x22, 0x20, 0x73, 0x70, - 0x61, 0x63, 0x65, 0x27, 0x3b, 0x0a, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, - 0x65, 0x74, 0x75, 0x72, 0x6e, 0x20, 0x72, 0x75, 0x6c, 0x65, 0x3b, 0x0a, - 0x20, 0x20, 0x7d, 0x0a, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x6d, 0x61, - 0x74, 0x47, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x28, 0x29, 0x20, 0x7b, - 0x0a, 0x20, 0x20, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x72, 0x61, - 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x3d, 0x20, 0x27, 0x27, 0x3b, 0x0a, 0x20, - 0x20, 0x20, 0x20, 0x66, 0x6f, 0x72, 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, - 0x74, 0x20, 0x5b, 0x6e, 0x61, 0x6d, 0x65, 0x2c, 0x20, 0x72, 0x75, 0x6c, - 0x65, 0x5d, 0x20, 0x6f, 0x66, 0x20, 0x4f, 0x62, 0x6a, 0x65, 0x63, 0x74, - 0x2e, 0x65, 0x6e, 0x74, 0x72, 0x69, 0x65, 0x73, 0x28, 0x74, 0x68, 0x69, - 0x73, 0x2e, 0x5f, 0x72, 0x75, 0x6c, 0x65, 0x73, 0x29, 0x2e, 0x73, 0x6f, - 0x72, 0x74, 0x28, 0x28, 0x5b, 0x61, 0x5d, 0x2c, 0x20, 0x5b, 0x62, 0x5d, - 0x29, 0x20, 0x3d, 0x3e, 0x20, 0x61, 0x2e, 0x6c, 0x6f, 0x63, 0x61, 0x6c, - 0x65, 0x43, 0x6f, 0x6d, 0x70, 0x61, 0x72, 0x65, 0x28, 0x62, 0x29, 0x29, - 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x67, 0x72, - 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x20, 0x2b, 0x3d, 0x20, 0x60, 0x24, 0x7b, - 0x6e, 0x61, 0x6d, 0x65, 0x7d, 0x20, 0x3a, 0x3a, 0x3d, 0x20, 0x24, 0x7b, - 0x72, 0x75, 0x6c, 0x65, 0x7d, 0x5c, 0x6e, 0x60, 0x3b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x72, 0x65, 0x74, 0x75, - 0x72, 0x6e, 0x20, 0x67, 0x72, 0x61, 0x6d, 0x6d, 0x61, 0x72, 0x3b, 0x0a, - 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a, 0x0a, 0x2f, 0x2f, 0x20, 0x48, 0x65, - 0x6c, 0x70, 0x65, 0x72, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x20, 0x74, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x65, - 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x73, 0x20, 0x62, 0x79, 0x20, 0x61, - 0x20, 0x6b, 0x65, 0x79, 0x20, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, - 0x6e, 0x0a, 0x66, 0x75, 0x6e, 0x63, 0x74, 0x69, 0x6f, 0x6e, 0x2a, 0x20, - 0x67, 0x72, 0x6f, 0x75, 0x70, 0x42, 0x79, 0x28, 0x69, 0x74, 0x65, 0x72, - 0x61, 0x62, 0x6c, 0x65, 0x2c, 0x20, 0x6b, 0x65, 0x79, 0x46, 0x6e, 0x29, - 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x6c, 0x61, 0x73, - 0x74, 0x4b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x6e, 0x75, 0x6c, 0x6c, 0x3b, - 0x0a, 0x20, 0x20, 0x6c, 0x65, 0x74, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, - 0x20, 0x3d, 0x20, 0x5b, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x66, 0x6f, 0x72, - 0x20, 0x28, 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x65, 0x6c, 0x65, 0x6d, - 0x65, 0x6e, 0x74, 0x20, 0x6f, 0x66, 0x20, 0x69, 0x74, 0x65, 0x72, 0x61, - 0x62, 0x6c, 0x65, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x63, - 0x6f, 0x6e, 0x73, 0x74, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x6b, - 0x65, 0x79, 0x46, 0x6e, 0x28, 0x65, 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, - 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x69, 0x66, 0x20, 0x28, 0x6c, - 0x61, 0x73, 0x74, 0x4b, 0x65, 0x79, 0x20, 0x21, 0x3d, 0x3d, 0x20, 0x6e, - 0x75, 0x6c, 0x6c, 0x20, 0x26, 0x26, 0x20, 0x6b, 0x65, 0x79, 0x20, 0x21, - 0x3d, 0x3d, 0x20, 0x6c, 0x61, 0x73, 0x74, 0x4b, 0x65, 0x79, 0x29, 0x20, - 0x7b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, - 0x64, 0x20, 0x5b, 0x6c, 0x61, 0x73, 0x74, 0x4b, 0x65, 0x79, 0x2c, 0x20, - 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5d, 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x20, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x3d, 0x20, 0x5b, 0x5d, - 0x3b, 0x0a, 0x20, 0x20, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x20, 0x20, - 0x67, 0x72, 0x6f, 0x75, 0x70, 0x2e, 0x70, 0x75, 0x73, 0x68, 0x28, 0x65, - 0x6c, 0x65, 0x6d, 0x65, 0x6e, 0x74, 0x29, 0x3b, 0x0a, 0x20, 0x20, 0x20, - 0x20, 0x6c, 0x61, 0x73, 0x74, 0x4b, 0x65, 0x79, 0x20, 0x3d, 0x20, 0x6b, - 0x65, 0x79, 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x20, 0x20, 0x69, 0x66, - 0x20, 0x28, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x2e, 0x6c, 0x65, 0x6e, 0x67, - 0x74, 0x68, 0x20, 0x3e, 0x20, 0x30, 0x29, 0x20, 0x7b, 0x0a, 0x20, 0x20, - 0x20, 0x20, 0x79, 0x69, 0x65, 0x6c, 0x64, 0x20, 0x5b, 0x6c, 0x61, 0x73, - 0x74, 0x4b, 0x65, 0x79, 0x2c, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x5d, - 0x3b, 0x0a, 0x20, 0x20, 0x7d, 0x0a, 0x7d, 0x0a -}; -size_t json_schema_to_grammar_mjs_len = 19964; diff --git a/examples/server/public/completion.js b/examples/server/public/completion.js index 835ce6e68..987b9a3b8 100644 --- a/examples/server/public/completion.js +++ b/examples/server/public/completion.js @@ -21,6 +21,7 @@ let generation_settings = null; // export async function* llama(prompt, params = {}, config = {}) { let controller = config.controller; + const api_url = config.api_url || ""; if (!controller) { controller = new AbortController(); @@ -28,7 +29,7 @@ export async function* llama(prompt, params = {}, config = {}) { const completionParams = { ...paramDefaults, ...params, prompt }; - const response = await fetch("/completion", { + const response = await fetch(`${api_url}/completion`, { method: 'POST', body: JSON.stringify(completionParams), headers: { @@ -193,9 +194,10 @@ export const llamaComplete = async (params, controller, callback) => { } // Get the model info from the server. This is useful for getting the context window and so on. -export const llamaModelInfo = async () => { +export const llamaModelInfo = async (config = {}) => { if (!generation_settings) { - const props = await fetch("/props").then(r => r.json()); + const api_url = config.api_url || ""; + const props = await fetch(`${api_url}/props`).then(r => r.json()); generation_settings = props.default_generation_settings; } return generation_settings; diff --git a/examples/server/public/favicon.ico b/examples/server/public/favicon.ico new file mode 100644 index 000000000..89e154a0a Binary files /dev/null and b/examples/server/public/favicon.ico differ diff --git a/examples/server/public/index.html b/examples/server/public/index.html index bbc5c2f68..2961999f2 100644 --- a/examples/server/public/index.html +++ b/examples/server/public/index.html @@ -199,10 +199,10 @@ + + + +
+ +
+
+ + + + diff --git a/examples/server/themes/wild/README.md b/examples/server/themes/wild/README.md new file mode 100644 index 000000000..560bcc81b --- /dev/null +++ b/examples/server/themes/wild/README.md @@ -0,0 +1,5 @@ +# LLaMA.cpp Server Wild Theme + +Simple tweaks to the UI. To use simply run server with `--path=themes/wild` + +![image](wild.png) diff --git a/examples/server/themes/wild/favicon.ico b/examples/server/themes/wild/favicon.ico new file mode 100644 index 000000000..89e154a0a Binary files /dev/null and b/examples/server/themes/wild/favicon.ico differ diff --git a/examples/server/themes/wild/index.html b/examples/server/themes/wild/index.html new file mode 100644 index 000000000..772e716cd --- /dev/null +++ b/examples/server/themes/wild/index.html @@ -0,0 +1,1061 @@ + + + + + + + llama.cpp - chat + + + + + + + +
+ +
+
+ + + + diff --git a/examples/server/themes/wild/llama_cpp.png b/examples/server/themes/wild/llama_cpp.png new file mode 100644 index 000000000..bad1dc9fc Binary files /dev/null and b/examples/server/themes/wild/llama_cpp.png differ diff --git a/examples/server/themes/wild/llamapattern.png b/examples/server/themes/wild/llamapattern.png new file mode 100644 index 000000000..2a159ce6a Binary files /dev/null and b/examples/server/themes/wild/llamapattern.png differ diff --git a/examples/server/themes/wild/wild.png b/examples/server/themes/wild/wild.png new file mode 100644 index 000000000..46ffa0f3e Binary files /dev/null and b/examples/server/themes/wild/wild.png differ diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index ff146893d..d872b63f5 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -3,6 +3,8 @@ #include "llama.h" #include "common.h" +// Change JSON_ASSERT from assert() to GGML_ASSERT: +#define JSON_ASSERT GGML_ASSERT #include "json.hpp" #include @@ -12,7 +14,7 @@ #define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" -using json = nlohmann::json; +using json = nlohmann::ordered_json; // https://community.openai.com/t/openai-chat-list-of-error-codes-and-types/357791/11 enum error_type { @@ -49,24 +51,35 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) +static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra); + template -static T json_value(const json &body, const std::string &key, const T &default_value) { +static T json_value(const json & body, const std::string & key, const T & default_value) { // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? body.value(key, default_value) - : default_value; + if (body.contains(key) && !body.at(key).is_null()) { + try { + return body.at(key); + } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) { + std::stringstream ss; + ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value."; + LOG_WARNING(ss.str().c_str(), body); + return default_value; + } + } else { + return default_value; + } } -static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { +static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) { std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); - json log = nlohmann::ordered_json{ + json log = json{ {"tid", ss_tid.str()}, {"timestamp", time(nullptr)}, }; if (server_log_json) { - log.merge_patch( { + log.merge_patch({ {"level", level}, {"function", function}, {"line", line}, @@ -87,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line, } std::stringstream ss; ss << buf << " |"; - for (const auto& el : log.items()) + for (const auto & el : log.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); ss << " " << el.key() << "=" << value; @@ -95,8 +108,8 @@ static inline void server_log(const char *level, const char *function, int line, const std::string str = ss.str(); printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); } + fflush(stdout); } // @@ -352,51 +365,67 @@ static json oaicompat_completion_params_parse( // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; llama_params["model"] = json_value(body, "model", std::string("unknown")); - llama_params["prompt"] = format_chat(model, chat_template, body["messages"]); - llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); - llama_params["temperature"] = json_value(body, "temperature", 0.0); - llama_params["top_k"] = json_value(body, "top_k", default_sparams.top_k); - llama_params["top_p"] = json_value(body, "top_p", 1.0); - llama_params["n_predict"] = json_value(body, "max_tokens", -1); - llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); llama_params["frequency_penalty"] = json_value(body, "frequency_penalty", 0.0); + llama_params["logit_bias"] = json_value(body, "logit_bias", json::object()); + llama_params["n_predict"] = json_value(body, "max_tokens", -1); llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0); llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED); llama_params["stream"] = json_value(body, "stream", false); - llama_params["mirostat"] = json_value(body, "mirostat", default_sparams.mirostat); - llama_params["mirostat_tau"] = json_value(body, "mirostat_tau", default_sparams.mirostat_tau); - llama_params["mirostat_eta"] = json_value(body, "mirostat_eta", default_sparams.mirostat_eta); - llama_params["penalize_nl"] = json_value(body, "penalize_nl", default_sparams.penalize_nl); - llama_params["typical_p"] = json_value(body, "typical_p", default_sparams.typical_p); - llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n); - llama_params["ignore_eos"] = json_value(body, "ignore_eos", false); - llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z); - llama_params["n_keep"] = json_value(body, "n_keep", 0); + llama_params["temperature"] = json_value(body, "temperature", 0.0); + llama_params["top_p"] = json_value(body, "top_p", 1.0); - if (body.contains("grammar")) { - llama_params["grammar"] = json_value(body, "grammar", json::object()); - } + // Apply chat template to the list of messages + llama_params["prompt"] = format_chat(model, chat_template, body.at("messages")); - if (body.contains("response_format")) { - auto response_format = json_value(body, "response_format", json::object()); - if (response_format.contains("type")) { - if (response_format["type"] == "json_object") { - llama_params["json_schema"] = json_value(response_format, "schema", json::object()); - } else { - throw std::runtime_error("response_format type not supported: " + response_format["type"].dump()); - } - } - } - - // Handle 'stop' field - if (body.contains("stop") && body["stop"].is_string()) { - llama_params["stop"] = json::array({body["stop"].get()}); + // Handle "stop" field + if (body.contains("stop") && body.at("stop").is_string()) { + llama_params["stop"] = json::array({body.at("stop").get()}); } else { llama_params["stop"] = json_value(body, "stop", json::array()); } - // Ensure there is ChatML-specific end sequence among stop words - llama_params["stop"].push_back("<|im_end|>"); + // Handle "response_format" field + if (body.contains("response_format")) { + json response_format = json_value(body, "response_format", json::object()); + std::string response_type = json_value(response_format, "type", std::string()); + if (response_type == "json_object") { + llama_params["json_schema"] = json_value(response_format, "schema", json::object()); + } else if (!response_type.empty() && response_type != "text") { + throw std::runtime_error("response_format type must be one of \"text\" or \"json_object\", but got: " + response_type); + } + } + + // Handle "n" field + int n_choices = json_value(body, "n", 1); + if (n_choices != 1) { + throw std::runtime_error("Only one completion choice is allowed"); + } + + // Handle "logprobs" field + // TODO: The response format of this option is not yet OAI-compatible, but seems like no one really using it; We may need to fix it in the future + if (body.contains("logprobs")) { + llama_params["n_probs"] = json_value(body, "top_logprobs", 20); + } else if (body.contains("top_logprobs")) { + throw std::runtime_error("top_logprobs requires logprobs to be set to true"); + } + + // Params supported by OAI but unsupported by llama.cpp + static const std::vector unsupported_params { "tools", "tool_choice" }; + for (auto & param : unsupported_params) { + if (body.contains(param)) { + throw std::runtime_error("Unsupported param: " + param); + } + } + + // Copy remaining properties to llama_params + // This allows user to use llama.cpp-specific params like "mirostat", "tfs_z",... via OAI endpoint. + // See "launch_slot_with_task()" for a complete list of params supported by llama.cpp + for (const auto & item : body.items()) { + // Exception: if "n_predict" is present, we overwrite the value specified earlier by "max_tokens" + if (!llama_params.contains(item.key()) || item.key() == "n_predict") { + llama_params[item.key()] = item.value(); + } + } return llama_params; } @@ -536,6 +565,15 @@ static std::vector format_partial_response_oaicompat(json result, const st {"model", modelname}, {"object", "chat.completion.chunk"} }; + if (!finish_reason.empty()) { + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + ret.push_back({"usage", json { + {"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens} + }}); + } return std::vector({ret}); } diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 39e2d8ea4..b0f8e0fdc 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -133,8 +133,8 @@ int main(int argc, char ** argv) { // sample the most likely token const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); - // is it an end of stream? - if (new_token_id == llama_token_eos(model) || n_cur == n_len) { + // is it an end of generation? + if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) { LOG_TEE("\n"); break; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index e991b8846..12e46fbc9 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -65,7 +65,6 @@ int main(int argc, char ** argv) { llama_context * ctx_dft = NULL; // load the target model - params.logits_all = true; std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params); // load the draft model @@ -77,6 +76,28 @@ int main(int argc, char ** argv) { params.n_threads_batch = params.n_threads_batch_draft; std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params); + const bool vocab_type_tgt = llama_vocab_type(model_tgt); + LOG("vocab_type tgt: %d\n", vocab_type_tgt); + + const bool vocab_type_dft = llama_vocab_type(model_dft); + LOG("vocab_type dft: %d\n", vocab_type_dft); + + if (vocab_type_tgt != vocab_type_dft) { + fprintf(stderr, "%s: error: draft model vocab type must match target model to use speculation but ", __func__); + fprintf(stderr, "vocab_type_dft = %d while vocab_type_tgt = %d\n", vocab_type_dft, vocab_type_tgt); + return 1; + } + + if ( + llama_add_bos_token(model_tgt) != llama_add_bos_token(model_dft) || + llama_add_eos_token(model_tgt) != llama_add_eos_token(model_dft) || + llama_token_bos(model_tgt) != llama_token_bos(model_dft) || + llama_token_eos(model_tgt) != llama_token_eos(model_dft) + ) { + fprintf(stderr, "%s: error: draft model special tokens must match target model to use speculation\n", __func__); + return 1; + } + { const int n_vocab_tgt = llama_n_vocab(model_tgt); const int n_vocab_dft = llama_n_vocab(model_dft); @@ -106,20 +127,8 @@ int main(int argc, char ** argv) { // Tokenize the prompt - const bool add_bos_tgt = llama_should_add_bos_token(model_tgt); - LOG("add_bos tgt: %d\n", add_bos_tgt); - - const bool add_bos_dft = llama_should_add_bos_token(model_dft); - LOG("add_bos dft: %d\n", add_bos_dft); - - if (add_bos_tgt != add_bos_dft) { - fprintf(stderr, "%s: error: draft model add_bos must match target model to use speculation but ", __func__); - fprintf(stderr, "add_bos_dft = %d while add_bos_tgt = %d\n", add_bos_dft, add_bos_tgt); - return 1; - } - std::vector inp; - inp = ::llama_tokenize(ctx_tgt, params.prompt, add_bos_tgt, true); + inp = ::llama_tokenize(ctx_tgt, params.prompt, true, true); const int max_context_size = llama_n_ctx(ctx_tgt); const int max_tokens_list_size = max_context_size - 4; @@ -219,7 +228,8 @@ int main(int argc, char ** argv) { if (params.sparams.temp > 0) { // stochastic verification - llama_token_data_array dist_tgt = llama_sampling_probability_distribution(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft]); + llama_token_data_array dist_tgt = llama_sampling_prepare(ctx_sampling, ctx_tgt, NULL, drafts[s_keep].i_batch_tgt[i_dft], true, NULL); + llama_sample_softmax(ctx_tgt, &dist_tgt); float p_tgt = 0, p_dft = 0; // GGML_ASSERT(dist_tgt.size() == dist_dft.size()); @@ -350,7 +360,7 @@ int main(int argc, char ** argv) { } } - if (token_id == llama_token_eos(model_tgt)) { + if (llama_token_is_eog(model_tgt, token_id)) { has_eos = true; } ++n_predict; diff --git a/examples/sycl/README.md b/examples/sycl/README.md index b46f17f39..c589c2d3a 100644 --- a/examples/sycl/README.md +++ b/examples/sycl/README.md @@ -1,6 +1,6 @@ # llama.cpp/example/sycl -This example program provide the tools for llama.cpp for SYCL on Intel GPU. +This example program provides the tools for llama.cpp for SYCL on Intel GPU. ## Tool diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh index f20391d7a..db46d57ca 100755 --- a/examples/sycl/build.sh +++ b/examples/sycl/build.sh @@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx #cmake --build . --config Release --target llama-bench #build all binary -cmake --build . --config Release -v +cmake --build . --config Release -j -v diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index c979a52f6..7b39a18c0 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -12,6 +12,7 @@ if [ $# -gt 0 ]; then GGML_SYCL_SINGLE_GPU=1 else GGML_SYCL_DEVICE=0 + GGML_SYCL_SINGLE_GPU=0 fi #export GGML_SYCL_DEBUG=1 diff --git a/examples/sycl/win-build-sycl.bat b/examples/sycl/win-build-sycl.bat index f9d43f8ed..1b0dc41ba 100644 --- a/examples/sycl/win-build-sycl.bat +++ b/examples/sycl/win-build-sycl.bat @@ -3,9 +3,13 @@ :: Copyright (C) 2024 Intel Corporation :: SPDX-License-Identifier: MIT -mkdir -p build + +IF not exist build (mkdir build) cd build +if %errorlevel% neq 0 goto ERROR + @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force +if %errorlevel% neq 0 goto ERROR :: for FP16 :: faster for long-prompt inference @@ -13,11 +17,18 @@ cd build :: for FP32 cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release - - +if %errorlevel% neq 0 goto ERROR :: build example/main only :: make main :: build all binary make -j +if %errorlevel% neq 0 goto ERROR + cd .. +exit /B 0 + +:ERROR +echo comomand error: %errorlevel% +exit /B %errorlevel% + diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index d95a92475..8b1baea80 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -26,11 +26,9 @@ int main(int argc, char ** argv) { llama_context_params ctx_params = llama_context_default_params(); llama_context * ctx = llama_new_context_with_model(model, ctx_params); - const bool add_bos = llama_should_add_bos_token(model); - std::vector tokens; - tokens = ::llama_tokenize(model, prompt, add_bos, true); + tokens = ::llama_tokenize(model, prompt, true, true); for (int i = 0; i < (int) tokens.size(); i++) { if (printing_ids) { diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index 7d06e401b..587418cc7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -73,6 +73,7 @@ struct my_llama_model { static const char * LLM_KV_TRAINING_TYPE_TRAIN_MODEL = "train_model"; static const char * LLM_KV_TRAINING_TYPE = "training.type"; +static const char * LLM_KV_GENERAL_NAME = "general.name"; static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture"; static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type"; @@ -529,6 +530,7 @@ static void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_contex static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) { const char * arch = "llama"; + enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32; std::vector keybuf; @@ -540,6 +542,7 @@ static void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vo // set arch gguf_set_val_str(fctx, LLM_KV_GENERAL_ARCHITECTURE, arch); + gguf_set_val_str(fctx, LLM_KV_GENERAL_NAME, arch); gguf_set_val_u32(fctx, LLM_KV_GENERAL_FILE_TYPE, ftype); // set hparams diff --git a/examples/ts-type-to-grammar.sh b/examples/ts-type-to-grammar.sh index a7ad00173..9abba2a3d 100755 --- a/examples/ts-type-to-grammar.sh +++ b/examples/ts-type-to-grammar.sh @@ -1,7 +1,7 @@ #!/bin/bash # # ./examples/ts-type-to-grammar.sh "{a:string,b:string,c?:string}" -# python examples/json-schema-to-grammar.py https://json.schemastore.org/tsconfig.json +# python examples/json_schema_to_grammar.py https://json.schemastore.org/tsconfig.json # set -euo pipefail @@ -25,4 +25,4 @@ npx ts-json-schema-generator --unstable --no-top-ref --path "$DTS_FILE" --type M # https://github.com/YousefED/typescript-json-schema # npx typescript-json-schema --defaultProps --required "$DTS_FILE" MyType | tee "$SCHEMA_FILE" >&2 -./examples/json-schema-to-grammar.py "$SCHEMA_FILE" +./examples/json_schema_to_grammar.py "$SCHEMA_FILE" diff --git a/flake.lock b/flake.lock index 80de76dbf..c9ead0bf7 100644 --- a/flake.lock +++ b/flake.lock @@ -5,11 +5,11 @@ "nixpkgs-lib": "nixpkgs-lib" }, "locked": { - "lastModified": 1709336216, - "narHash": "sha256-Dt/wOWeW6Sqm11Yh+2+t0dfEWxoMxGBvv3JpIocFl9E=", + "lastModified": 1714641030, + "narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=", "owner": "hercules-ci", "repo": "flake-parts", - "rev": "f7b3c975cf067e56e7cda6cb098ebe3fb4d74ca2", + "rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e", "type": "github" }, "original": { @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1710451336, - "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=", + "lastModified": 1714635257, + "narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "d691274a972b3165335d261cc4671335f5c67de9", + "rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f", "type": "github" }, "original": { @@ -36,20 +36,14 @@ }, "nixpkgs-lib": { "locked": { - "dir": "lib", - "lastModified": 1709237383, - "narHash": "sha256-cy6ArO4k5qTx+l5o+0mL9f5fa86tYUX3ozE1S+Txlds=", - "owner": "NixOS", - "repo": "nixpkgs", - "rev": "1536926ef5621b09bba54035ae2bb6d806d72ac8", - "type": "github" + "lastModified": 1714640452, + "narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=", + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" }, "original": { - "dir": "lib", - "owner": "NixOS", - "ref": "nixos-unstable", - "repo": "nixpkgs", - "type": "github" + "type": "tarball", + "url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz" } }, "root": { diff --git a/flake.nix b/flake.nix index 45f9deda0..9cd3756e5 100644 --- a/flake.nix +++ b/flake.nix @@ -145,6 +145,7 @@ # the same path you would with an overlay. legacyPackages = { llamaPackages = pkgs.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; + llamaPackagesWindows = pkgs.pkgsCross.mingwW64.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackagesCuda = pkgsCuda.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; llamaPackagesRocm = pkgsRocm.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; }; @@ -155,6 +156,7 @@ { default = config.legacyPackages.llamaPackages.llama-cpp; vulkan = config.packages.default.override { useVulkan = true; }; + windows = config.legacyPackages.llamaPackagesWindows.llama-cpp; } // lib.optionalAttrs pkgs.stdenv.isLinux { opencl = config.packages.default.override { useOpenCL = true; }; @@ -168,9 +170,14 @@ }; # Packages exposed in `.#checks` will be built by the CI and by - # `nix flake check`. Currently we expose all packages, but we could - # make more granular choices - checks = config.packages; + # `nix flake check`. + # + # We could test all outputs e.g. as `checks = confg.packages`. + # + # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed + checks = { + inherit (config.packages) default vulkan; + }; }; }; } diff --git a/ggml-alloc.c b/ggml-alloc.c index 643b2e55f..1fbd376ed 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -371,16 +371,16 @@ struct ggml_gallocr { }; ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { - ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); + ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(1, sizeof(struct ggml_gallocr)); GGML_ASSERT(galloc != NULL); - galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); + galloc->bufts = calloc(n_bufs, sizeof(ggml_backend_buffer_type_t)); GGML_ASSERT(galloc->bufts != NULL); - galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); + galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t) * n_bufs); GGML_ASSERT(galloc->buffers != NULL); - galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); + galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *)); GGML_ASSERT(galloc->buf_tallocs != NULL); for (int i = 0; i < n_bufs; i++) { @@ -646,8 +646,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c free(galloc->hash_set.keys); free(galloc->hash_values); galloc->hash_set.size = hash_size; - galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); - galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *)); + galloc->hash_values = calloc(hash_size, sizeof(struct hash_node)); GGML_ASSERT(galloc->hash_set.keys != NULL); GGML_ASSERT(galloc->hash_values != NULL); } else { @@ -667,7 +667,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c // set the node_allocs from the hash table if (galloc->n_nodes < graph->n_nodes) { free(galloc->node_allocs); - galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + galloc->node_allocs = calloc(graph->n_nodes, sizeof(struct node_alloc)); GGML_ASSERT(galloc->node_allocs != NULL); } galloc->n_nodes = graph->n_nodes; @@ -697,7 +697,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c } if (galloc->n_leafs < graph->n_leafs) { free(galloc->leaf_allocs); - galloc->leaf_allocs = calloc(sizeof(galloc->leaf_allocs[0]), graph->n_leafs); + galloc->leaf_allocs = calloc(graph->n_leafs, sizeof(galloc->leaf_allocs[0])); GGML_ASSERT(galloc->leaf_allocs != NULL); } galloc->n_leafs = graph->n_leafs; @@ -705,8 +705,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c struct ggml_tensor * leaf = graph->leafs[i]; struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf); galloc->leaf_allocs[i].buffer_id = hn->buffer_id; - galloc->leaf_allocs[i].leaf.offset = hn->offset; - galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); + if (leaf->view_src || leaf->data) { + galloc->leaf_allocs[i].leaf.offset = SIZE_MAX; + galloc->leaf_allocs[i].leaf.size_max = 0; + } else { + galloc->leaf_allocs[i].leaf.offset = hn->offset; + galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf); + } } // reallocate buffers if needed diff --git a/ggml-backend.c b/ggml-backend.c index 6026570ae..f5bdcf078 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -420,7 +420,7 @@ GGML_CALL static void ggml_backend_registry_init(void) { ggml_backend_register("CPU", ggml_backend_reg_cpu_init, ggml_backend_cpu_buffer_type(), NULL); // add forward decls here to avoid including the backend headers -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA extern GGML_CALL void ggml_backend_cuda_reg_devices(void); ggml_backend_cuda_reg_devices(); #endif @@ -822,7 +822,11 @@ GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { switch (op->op) { case GGML_OP_CPY: - return op->type != GGML_TYPE_IQ2_XXS && op->type != GGML_TYPE_IQ2_XS && op->type != GGML_TYPE_IQ1_S; // missing type_traits.from_float + return + op->type != GGML_TYPE_IQ2_XXS && + op->type != GGML_TYPE_IQ2_XS && + op->type != GGML_TYPE_IQ1_S && + op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float case GGML_OP_MUL_MAT: return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type; default: @@ -1721,23 +1725,23 @@ ggml_backend_sched_t ggml_backend_sched_new( GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS); GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU - struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched)); // initialize hash table sched->hash_set = ggml_hash_set_new(graph_size); - sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); - sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); + sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0])); + sched->tensor_copies = calloc(sched->hash_set.size, sizeof(sched->tensor_copies[0])); const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; - sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); - sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); + sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0])); + sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0])); sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; const int initial_splits_capacity = 16; - sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); + sched->splits = calloc(initial_splits_capacity, sizeof(sched->splits[0])); sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { @@ -1780,12 +1784,14 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { void ggml_backend_sched_reset(ggml_backend_sched_t sched) { // reset state for the next run - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT - memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); - memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); + if (!sched->is_reset) { + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT + memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); + memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - sched->is_reset = true; + sched->is_reset = true; + } sched->is_alloc = false; } @@ -1968,10 +1974,10 @@ static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_te struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = { /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT + /* .keys = */ calloc(graph->visited_hash_table.size, sizeof(hash_set.keys[0])) // NOLINT }; - struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT - bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); + struct ggml_tensor ** node_copies = calloc(hash_set.size, sizeof(node_copies[0])); // NOLINT + bool * node_init = calloc(hash_set.size, sizeof(node_init[0])); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), diff --git a/ggml-backend.h b/ggml-backend.h index 422457ab6..744b6a774 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -137,7 +137,7 @@ extern "C" { /* Example usage: - // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be asigned + // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned // preferrably to run on the same backend as the buffer ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); diff --git a/ggml-common.h b/ggml-common.h index 0257c928c..43c7978a0 100644 --- a/ggml-common.h +++ b/ggml-common.h @@ -377,6 +377,27 @@ typedef struct { } block_iq1_s; static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding"); +// 1.75 bpw +typedef struct { + uint8_t qs[QK_K/8]; // grid index, low 8 bits + uint8_t qh[QK_K/16]; // grid index, high 3 bits + grid shift bit (for two groups of 8) +#if QK_K == 64 + ggml_half d; +#endif + uint8_t scales[QK_K/32]; // 3-bit block scales (4-bit if QK_K == 64) +} block_iq1_m; +#if QK_K == 64 +static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32 + sizeof(ggml_half), "wrong iq1_m block size/padding"); +#else +static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding"); +#endif + +// Used by IQ1_M quants +typedef union { + ggml_half f16; + uint16_t u16; +} iq1m_scale_t; + // Non-linear quants #define QK4_NL 32 typedef struct { @@ -426,10 +447,11 @@ static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_ #define GGML_COMMON_IMPL #elif defined(GGML_COMMON_IMPL_SYCL) + #include -#define GGML_TABLE_BEGIN(type, name, size) static dpct::global_memory name(sycl::range<1>(size), { -#define GGML_TABLE_END() }); +#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = { +#define GGML_TABLE_END() }; #define GGML_COMMON_IMPL #endif @@ -1050,6 +1072,7 @@ GGML_TABLE_END() #define NGRID_IQ1S 2048 #define IQ1S_DELTA 0.125f +#define IQ1M_DELTA 0.125f #if defined(GGML_COMMON_IMPL_C) GGML_TABLE_BEGIN(uint64_t, iq1s_grid, NGRID_IQ1S) 0xffffffffffffffff, 0xffffffffffffff01, 0xffffffffffff0000, 0xffffffffffff01ff, diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 14f409eb1..6f89a7cc3 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2,18 +2,37 @@ #include "ggml.h" #include "ggml-backend-impl.h" -#if defined(GGML_USE_HIPBLAS) -#define GGML_COMMON_DECL_HIP -#define GGML_COMMON_IMPL_HIP -#else -#define GGML_COMMON_DECL_CUDA -#define GGML_COMMON_IMPL_CUDA -#endif -#include "ggml-common.h" +#include "ggml-cuda/common.cuh" +#include "ggml-cuda/acc.cuh" +#include "ggml-cuda/alibi.cuh" +#include "ggml-cuda/arange.cuh" +#include "ggml-cuda/argsort.cuh" +#include "ggml-cuda/binbcast.cuh" +#include "ggml-cuda/clamp.cuh" +#include "ggml-cuda/concat.cuh" +#include "ggml-cuda/convert.cuh" +#include "ggml-cuda/cpy.cuh" +#include "ggml-cuda/diagmask.cuh" +#include "ggml-cuda/dmmv.cuh" +#include "ggml-cuda/fattn.cuh" +#include "ggml-cuda/getrows.cuh" +#include "ggml-cuda/im2col.cuh" +#include "ggml-cuda/mmq.cuh" +#include "ggml-cuda/mmvq.cuh" +#include "ggml-cuda/norm.cuh" +#include "ggml-cuda/pad.cuh" +#include "ggml-cuda/pool2d.cuh" +#include "ggml-cuda/quantize.cuh" +#include "ggml-cuda/rope.cuh" +#include "ggml-cuda/scale.cuh" +#include "ggml-cuda/softmax.cuh" +#include "ggml-cuda/sumrows.cuh" +#include "ggml-cuda/tsembd.cuh" +#include "ggml-cuda/unary.cuh" +#include "ggml-cuda/upscale.cuh" #include #include -#include #include #include #include @@ -28,161 +47,10 @@ #include #include -// stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string -#define STRINGIZE_IMPL(...) #__VA_ARGS__ -#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) - -#if defined(GGML_USE_HIPBLAS) -#include -#include -#include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ -#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F -#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F -#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F -#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT -#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT -#define CUBLAS_OP_N HIPBLAS_OP_N -#define CUBLAS_OP_T HIPBLAS_OP_T -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_TF32_TENSOR_OP_MATH 0 -#define CUDA_R_16F HIPBLAS_R_16F -#define CUDA_R_32F HIPBLAS_R_32F -#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) -#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 -#define cublasCreate hipblasCreate -#define cublasDestroy hipblasDestroy -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx -#define cublasHandle_t hipblasHandle_t -#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS -#define cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm -#define cublasStatus_t hipblasStatus_t -#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 -#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer -#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess -#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess -#define cudaDeviceProp hipDeviceProp_t -#define cudaDeviceSynchronize hipDeviceSynchronize -#define cudaError_t hipError_t -#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled -#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled -#define cudaEventCreateWithFlags hipEventCreateWithFlags -#define cudaEventDisableTiming hipEventDisableTiming -#define cudaEventRecord hipEventRecord -#define cudaEventSynchronize hipEventSynchronize -#define cudaEvent_t hipEvent_t -#define cudaEventDestroy hipEventDestroy -#define cudaFree hipFree -#define cudaFreeHost hipHostFree -#define cudaGetDevice hipGetDevice -#define cudaGetDeviceCount hipGetDeviceCount -#define cudaGetDeviceProperties hipGetDeviceProperties -#define cudaGetErrorString hipGetErrorString -#define cudaGetLastError hipGetLastError -#define cudaHostRegister hipHostRegister -#define cudaHostRegisterPortable hipHostRegisterPortable -#define cudaHostRegisterReadOnly hipHostRegisterReadOnly -#define cudaHostUnregister hipHostUnregister -#define cudaLaunchHostFunc hipLaunchHostFunc -#ifdef GGML_HIP_UMA -#define cudaMalloc hipMallocManaged -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) -#else -#define cudaMalloc hipMalloc -#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) -#endif -#define cudaMemcpy hipMemcpy -#define cudaMemcpyAsync hipMemcpyAsync -#define cudaMemcpyPeerAsync hipMemcpyPeerAsync -#define cudaMemcpy2DAsync hipMemcpy2DAsync -#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost -#define cudaMemcpyHostToDevice hipMemcpyHostToDevice -#define cudaMemcpyKind hipMemcpyKind -#define cudaMemset hipMemset -#define cudaMemsetAsync hipMemsetAsync -#define cudaMemGetInfo hipMemGetInfo -#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize -#define cudaSetDevice hipSetDevice -#define cudaStreamCreateWithFlags hipStreamCreateWithFlags -#define cudaStreamDestroy hipStreamDestroy -#define cudaStreamFireAndForget hipStreamFireAndForget -#define cudaStreamNonBlocking hipStreamNonBlocking -#define cudaStreamPerThread hipStreamPerThread -#define cudaStreamSynchronize hipStreamSynchronize -#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) -#define cudaStream_t hipStream_t -#define cudaSuccess hipSuccess -#define __trap abort -#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS -#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED -#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED -#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE -#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH -#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR -#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED -#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR -#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED -#else -#include -#include -#include -#include - -#if CUDART_VERSION < 11020 -#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED -#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH -#define CUBLAS_COMPUTE_16F CUDA_R_16F -#define CUBLAS_COMPUTE_32F CUDA_R_32F -#define cublasComputeType_t cudaDataType_t -#endif // CUDART_VERSION < 11020 - -#endif // defined(GGML_USE_HIPBLAS) - -#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) - -#define CC_PASCAL 600 -#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products -#define CC_VOLTA 700 -#define CC_OFFSET_AMD 1000000 -#define CC_RDNA1 (CC_OFFSET_AMD + 1010) -#define CC_RDNA2 (CC_OFFSET_AMD + 1030) -#define CC_RDNA3 (CC_OFFSET_AMD + 1100) - -// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication -// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant -// for large computational tasks. the drawback is that this requires some extra amount of VRAM: -// - 7B quantum model: +100-200 MB -// - 13B quantum model: +200-400 MB -// -//#define GGML_CUDA_FORCE_MMQ - -// TODO: improve this to be correct for more hardware -// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores -#if !defined(GGML_CUDA_FORCE_MMQ) -#define CUDA_USE_TENSOR_CORES -#endif - -#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels -#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available - -#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses - - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); [[noreturn]] -static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) { int id = -1; // in case cudaGetDevice fails cudaGetDevice(&id); @@ -193,65 +61,9 @@ static void ggml_cuda_error(const char * stmt, const char * func, const char * f GGML_ASSERT(!"CUDA error"); } -#define CUDA_CHECK_GEN(err, success, error_fn) \ - do { \ - auto err_ = (err); \ - if (err_ != (success)) { \ - ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ - } \ - } while (0) - -#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) - -#if CUDART_VERSION >= 12000 - static const char * cublas_get_error_str(const cublasStatus_t err) { - return cublasGetStatusString(err); - } -#else - static const char * cublas_get_error_str(const cublasStatus_t err) { - switch (err) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - default: return "unknown error"; - } - } -#endif // CUDART_VERSION >= 12000 - -#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) - -#if !defined(GGML_USE_HIPBLAS) -static const char * cu_get_error_str(CUresult err) { - const char * err_str; - cuGetErrorString(err, &err_str); - return err_str; -} -#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) -#endif - -#if CUDART_VERSION >= 11100 -#define GGML_CUDA_ASSUME(x) __builtin_assume(x) -#else -#define GGML_CUDA_ASSUME(x) -#endif // CUDART_VERSION >= 11100 - - -#define GGML_CUDA_MAX_STREAMS 8 - -struct ggml_tensor_extra_gpu { - void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors - cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs -}; - // this is faster on Windows // probably because the Windows CUDA libraries forget to make this check before invoking the drivers -static void ggml_cuda_set_device(const int device) { +void ggml_cuda_set_device(int device) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); @@ -262,28 +74,12 @@ static void ggml_cuda_set_device(const int device) { CUDA_CHECK(cudaSetDevice(device)); } -static int ggml_cuda_get_device() { +int ggml_cuda_get_device() { int id; CUDA_CHECK(cudaGetDevice(&id)); return id; } -struct ggml_cuda_device_info { - int device_count; - - struct cuda_device_info { - int cc; // compute capability - size_t smpb; // max. shared memory per block - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory - size_t total_vram; - }; - - cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {}; - - std::array default_tensor_split = {}; -}; - static ggml_cuda_device_info ggml_cuda_init() { #ifdef __HIP_PLATFORM_AMD__ // Workaround for a rocBLAS bug when using multiple graphics cards: @@ -317,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() { for (int id = 0; id < info.device_count; ++id) { int device_vmm = 0; -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) CUdevice device; CU_CHECK(cuDeviceGet(&device, id)); CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); @@ -345,6 +141,7 @@ static ggml_cuda_device_info ggml_cuda_init() { info.devices[id].cc = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) info.devices[id].smpb = prop.sharedMemPerBlock; + info.devices[id].nsm = prop.multiProcessorCount; } for (int id = 0; id < info.device_count; ++id) { @@ -357,7 +154,7 @@ static ggml_cuda_device_info ggml_cuda_init() { return info; } -static const ggml_cuda_device_info & get_cuda_global_info() { +const ggml_cuda_device_info & ggml_cuda_info() { static ggml_cuda_device_info info = ggml_cuda_init(); return info; } @@ -365,13 +162,6 @@ static const ggml_cuda_device_info & get_cuda_global_info() { // #define DEBUG_CUDA_MALLOC // buffer pool for cuda (legacy) -struct ggml_cuda_pool { - virtual ~ggml_cuda_pool() = default; - - virtual void * alloc(size_t size, size_t * actual_size) = 0; - virtual void free(void * ptr, size_t size) = 0; -}; - struct ggml_cuda_pool_leg : public ggml_cuda_pool { static const int MAX_BUFFERS = 256; @@ -469,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool { }; // pool with virtual memory -#if !defined(GGML_USE_HIPBLAS) +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) struct ggml_cuda_pool_vmm : public ggml_cuda_pool { static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB @@ -481,7 +271,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { explicit ggml_cuda_pool_vmm(int device) : device(device), - granularity(get_cuda_global_info().devices[device].vmm_granularity) { + granularity(ggml_cuda_info().devices[device].vmm_granularity) { } ~ggml_cuda_pool_vmm() { @@ -535,7 +325,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { pool_size += reserve_size; //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", - // id, (unsigned long long) (pool_size[id]/1024/1024), + // device, (unsigned long long) (pool_size/1024/1024), // (unsigned long long) (reserve_size/1024/1024)); } @@ -565,130 +355,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool { }; #endif // !defined(GGML_USE_HIPBLAS) -template -struct ggml_cuda_pool_alloc { - ggml_cuda_pool * pool = nullptr; - T * ptr = nullptr; - size_t actual_size = 0; - - ggml_cuda_pool_alloc() = default; - - explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) { +std::unique_ptr ggml_backend_cuda_context::new_pool_for_device(int device) { +#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM) + if (ggml_cuda_info().devices[device].vmm) { + return std::unique_ptr(new ggml_cuda_pool_vmm(device)); } - - ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) { - alloc(size); - } - - ~ggml_cuda_pool_alloc() { - if (ptr != nullptr) { - pool->free(ptr, actual_size); - } - } - - // size is in number of elements - T * alloc(size_t size) { - GGML_ASSERT(pool != nullptr); - GGML_ASSERT(ptr == nullptr); - ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); - return ptr; - } - - T * alloc(ggml_cuda_pool & pool, size_t size) { - this->pool = &pool; - return alloc(size); - } - - T * get() { - return ptr; - } - - ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete; - ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete; - ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete; - ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete; -}; - - -// backend interface - -struct ggml_backend_cuda_context { - int device; - std::string name; - cudaEvent_t copy_event = nullptr; - - cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; - cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - - explicit ggml_backend_cuda_context(int device) : - device(device), - name(GGML_CUDA_NAME + std::to_string(device)) { - } - - ~ggml_backend_cuda_context() { - if (copy_event != nullptr) { - CUDA_CHECK(cudaEventDestroy(copy_event)); - } - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { - for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { - if (streams[i][j] != nullptr) { - CUDA_CHECK(cudaStreamDestroy(streams[i][j])); - } - } - if (cublas_handles[i] != nullptr) { - CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); - } - } - } - - cudaStream_t stream(int device, int stream) { - if (streams[device][stream] == nullptr) { - ggml_cuda_set_device(device); - CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking)); - } - return streams[device][stream]; - } - - cudaStream_t stream() { - return stream(device, 0); - } - - cublasHandle_t cublas_handle(int device) { - if (cublas_handles[device] == nullptr) { - ggml_cuda_set_device(device); - CUBLAS_CHECK(cublasCreate(&cublas_handles[device])); - CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH)); - } - return cublas_handles[device]; - } - - cublasHandle_t cublas_handle() { - return cublas_handle(device); - } - - // pool - std::unique_ptr pools[GGML_CUDA_MAX_DEVICES]; - - static std::unique_ptr new_pool_for_device(int device) { -#if !defined(GGML_USE_HIPBLAS) - if (get_cuda_global_info().devices[device].vmm) { - return std::unique_ptr(new ggml_cuda_pool_vmm(device)); - } #endif - return std::unique_ptr(new ggml_cuda_pool_leg(device)); - } - - ggml_cuda_pool & pool(int device) { - if (pools[device] == nullptr) { - pools[device] = new_pool_for_device(device); - } - return *pools[device]; - } - - ggml_cuda_pool & pool() { - return pool(device); - } -}; + return std::unique_ptr(new ggml_cuda_pool_leg(device)); +} // cuda buffer @@ -729,10 +403,8 @@ GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - if (tensor->view_src != NULL && tensor->view_offs == 0) { + if (tensor->view_src != NULL) { assert(tensor->view_src->buffer->buft == buffer->buft); - tensor->backend = tensor->view_src->backend; - tensor->extra = tensor->view_src->extra; return; } @@ -771,7 +443,11 @@ GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t if (src_ctx->device == dst_ctx->device) { CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); } else { +#ifdef GGML_CUDA_NO_PEER_COPY + return false; +#else CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); +#endif } CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); return true; @@ -907,11 +583,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array get_cuda_global_info().devices[id].cc) { - min_compute_capability = get_cuda_global_info().devices[id].cc; + if (min_compute_capability > ggml_cuda_info().devices[id].cc) { + min_compute_capability = ggml_cuda_info().devices[id].cc; } - if (max_compute_capability < get_cuda_global_info().devices[id].cc) { - max_compute_capability = get_cuda_global_info().devices[id].cc; + if (max_compute_capability < ggml_cuda_info().devices[id].cc) { + max_compute_capability = ggml_cuda_info().devices[id].cc; } } } @@ -939,6 +615,7 @@ static int64_t get_row_rounding(ggml_type type, const std::arraybuft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; //} - - /// kernels - -#if defined(GGML_USE_HIPBLAS) -#define __CUDA_ARCH__ 1300 - -#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ - defined(__gfx1150__) || defined(__gfx1151__) -#define RDNA3 -#endif - -#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ - defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) -#define RDNA2 -#endif - -#ifndef __has_builtin - #define __has_builtin(x) 0 -#endif - -typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); -typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); -static __device__ __forceinline__ int __vsubss4(const int a, const int b) { - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); -#if __has_builtin(__builtin_elementwise_sub_sat) - const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); - return reinterpret_cast(c); -#else - int8x4_t c; - int16_t tmp; -#pragma unroll - for (int i = 0; i < 4; i++) { - tmp = va[i] - vb[i]; - if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); - if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); - c[i] = tmp; - } - return reinterpret_cast(c); -#endif // __has_builtin(__builtin_elementwise_sub_sat) -} - -static __device__ __forceinline__ int __vsub4(const int a, const int b) { - return __vsubss4(a, b); -} - -static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { - const uint8x4_t& va = reinterpret_cast(a); - const uint8x4_t& vb = reinterpret_cast(b); - unsigned int c; - uint8x4_t& vc = reinterpret_cast(c); -#pragma unroll - for (int i = 0; i < 4; ++i) { - vc[i] = va[i] == vb[i] ? 0xff : 0x00; - } - return c; -} - -static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { -#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) - c = __builtin_amdgcn_sdot4(a, b, c, false); -#elif defined(RDNA3) - c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); -#elif defined(__gfx1010__) || defined(__gfx900__) - int tmp1; - int tmp2; - asm("\n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ - v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ - v_add3_u32 %0, %1, %2, %0 \n \ - " - : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) - : "v"(a), "v"(b) - ); -#else - const int8x4_t va = reinterpret_cast(a); - const int8x4_t vb = reinterpret_cast(b); - c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; -#endif - return c; -} -#endif // defined(GGML_USE_HIPBLAS) - - -#ifdef GGML_CUDA_F16 -typedef half dfloat; // dequantize float -typedef half2 dfloat2; -#else -typedef float dfloat; // dequantize float -typedef float2 dfloat2; -#endif //GGML_CUDA_F16 - -static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment - - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - - return x32; -} - -static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { - const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment - - int x32 = 0; - x32 |= x16[0] << 0; - x32 |= x16[1] << 16; - - return x32; -} - -static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { - return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment -} - -template -using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int k, cudaStream_t stream); -typedef to_t_cuda_t to_fp32_cuda_t; -typedef to_t_cuda_t to_fp16_cuda_t; - -typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); -typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); -typedef void (*cpy_kernel_t)(const char * cx, char * cdst); - -typedef void (*ggml_cuda_func_t)(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - typedef void (*ggml_cuda_op_mul_mat_t)( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream); -typedef void (*ggml_cuda_op_flatten_t)( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream); - -typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); -typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); -typedef void (*load_tiles_cuda_t)( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); -typedef float (*vec_dot_q_mul_mat_cuda_t)( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); - -#define WARP_SIZE 32 - -#define CUDA_GELU_BLOCK_SIZE 256 -#define CUDA_SILU_BLOCK_SIZE 256 -#define CUDA_TANH_BLOCK_SIZE 256 -#define CUDA_RELU_BLOCK_SIZE 256 -#define CUDA_HARDSIGMOID_BLOCK_SIZE 256 -#define CUDA_HARDSWISH_BLOCK_SIZE 256 -#define CUDA_SQR_BLOCK_SIZE 256 -#define CUDA_CPY_BLOCK_SIZE 32 -#define CUDA_SCALE_BLOCK_SIZE 256 -#define CUDA_CLAMP_BLOCK_SIZE 256 -#define CUDA_ROPE_BLOCK_SIZE 256 -#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 -#define CUDA_ALIBI_BLOCK_SIZE 32 -#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 -#define CUDA_QUANTIZE_BLOCK_SIZE 256 -#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 -#define CUDA_GET_ROWS_BLOCK_SIZE 256 -#define CUDA_UPSCALE_BLOCK_SIZE 256 -#define CUDA_CONCAT_BLOCK_SIZE 256 -#define CUDA_PAD_BLOCK_SIZE 256 -#define CUDA_ARANGE_BLOCK_SIZE 256 -#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 -#define CUDA_ACC_BLOCK_SIZE 256 -#define CUDA_IM2COL_BLOCK_SIZE 256 -#define CUDA_POOL2D_BLOCK_SIZE 256 - -#define CUDA_Q8_0_NE_ALIGN 2048 - -// dmmv = dequantize_mul_mat_vec -#ifndef GGML_CUDA_DMMV_X -#define GGML_CUDA_DMMV_X 32 -#endif -#ifndef GGML_CUDA_MMV_Y -#define GGML_CUDA_MMV_Y 1 -#endif - -#ifndef K_QUANTS_PER_ITERATION -#define K_QUANTS_PER_ITERATION 2 -#else -static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); -#endif - #ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE #define GGML_CUDA_PEER_MAX_BATCH_SIZE 128 #endif // GGML_CUDA_PEER_MAX_BATCH_SIZE #define MUL_MAT_SRC1_COL_STRIDE 128 -[[noreturn]] -static __device__ void no_device_code( - const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", - file_name, line, function_name, arch); - GGML_UNUSED(arch_list); -#else - printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n", - file_name, line, function_name, arch, arch_list); -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - __trap(); - - GGML_UNUSED(no_device_code); // suppress unused function warning -} - -#ifdef __CUDA_ARCH__ -#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) -#else -//#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") -#define NO_DEVICE_CODE -#endif // __CUDA_ARCH__ - -static __device__ __forceinline__ float warp_reduce_sum(float x) { -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x += __shfl_xor_sync(0xffffffff, x, mask, 32); - } - return x; -} - -static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); - a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); - } - return a; -} - -#ifdef GGML_CUDA_F16 -static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); - } - return a; -#else - GGML_UNUSED(a); - NO_DEVICE_CODE; -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL -} -#endif // GGML_CUDA_F16 - -static __device__ __forceinline__ float warp_reduce_max(float x) { -#pragma unroll - for (int mask = 16; mask > 0; mask >>= 1) { - x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); - } - return x; -} - -//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -//#pragma unroll -// for (int mask = 16; mask > 0; mask >>= 1) { -// x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); -// } -// return x; -//#else -// GGML_UNUSED(x); -// NO_DEVICE_CODE; -//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX -//} - -static __device__ __forceinline__ float op_repeat(const float a, const float b) { - return b; - GGML_UNUSED(a); -} - -static __device__ __forceinline__ float op_add(const float a, const float b) { - return a + b; -} - -static __device__ __forceinline__ float op_mul(const float a, const float b) { - return a * b; -} - -static __device__ __forceinline__ float op_div(const float a, const float b) { - return a / b; -} - -template -static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13) { - const int i0s = blockDim.x*blockIdx.x + threadIdx.x; - const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); - const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; - const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; - - if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); - } -} - -template -static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, - int ne0, int ne1, int ne2, int ne3, - int ne10, int ne11, int ne12, int ne13, - /*int s0, */ int s1, int s2, int s3, - /*int s10,*/ int s11, int s12, int s13) { - - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - const int i3 = i/(ne2*ne1*ne0); - const int i2 = (i/(ne1*ne0)) % ne2; - const int i1 = (i/ne0) % ne1; - const int i0 = i % ne0; - - if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { - return; - } - - const int i11 = i1 % ne11; - const int i12 = i2 % ne12; - const int i13 = i3 % ne13; - - const size_t i_src0 = i3*s3 + i2*s2 + i1*s1; - const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; - const size_t i_dst = i_src0; - - const src0_t * src0_row = src0 + i_src0; - const src1_t * src1_row = src1 + i_src1; - dst_t * dst_row = dst + i_dst; - - const int i10 = i0 % ne10; - dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); -} - -static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, int offset) { - const int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i >= ne) { - return; - } - int src1_idx = i - offset; - int oz = src1_idx / nb2; - int oy = (src1_idx - (oz * nb2)) / nb1; - int ox = src1_idx % nb1; - if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { - dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; - } else { - dst[i] = x[i]; - } -} - -static __global__ void gelu_f32(const float * x, float * dst, const int k) { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - - float xi = x[i]; - dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); -} - -static __global__ void silu_f32(const float * x, float * dst, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - dst[i] = x[i] / (1.0f + expf(-x[i])); -} - -static __global__ void gelu_quick_f32(const float * x, float * dst, int k) { - const float GELU_QUICK_COEF = -1.702f; - const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { - return; - } - dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i]))); -} - -static __global__ void tanh_f32(const float * x, float * dst, int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { - return; - } - dst[i] = tanhf(x[i]); -} - -static __global__ void relu_f32(const float * x, float * dst, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - dst[i] = fmaxf(x[i], 0); -} - -static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); -} - -static __global__ void hardswish_f32(const float * x, float * dst, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); -} - -static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - if (i >= k) { - return; - } - dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope; -} - -static __global__ void sqr_f32(const float * x, float * dst, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - dst[i] = x[i] * x[i]; -} - -template -static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) { - const int row = blockIdx.x*blockDim.y + threadIdx.y; - const int tid = threadIdx.x; - - float2 mean_var = make_float2(0.f, 0.f); - - for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; - mean_var.x += xi; - mean_var.y += xi * xi; - } - - // sum up partial sums - mean_var = warp_reduce_sum(mean_var); - if (block_size > WARP_SIZE) { - __shared__ float2 s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = mean_var; - } - __syncthreads(); - mean_var = s_sum[lane_id]; - mean_var = warp_reduce_sum(mean_var); - } - - const float mean = mean_var.x / ncols; - const float var = mean_var.y / ncols - mean * mean; - const float inv_std = rsqrtf(var + eps); - - for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; - } -} - -static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) { - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (blockIdx.z < ne02) { // src0 - int offset_src = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - dst[offset_dst] = x[offset_src]; - } else { - int offset_src = - nidx + - blockIdx.y * ne0 + - (blockIdx.z - ne02) * ne0 * gridDim.y; - dst[offset_dst] = y[offset_src]; - } -} - -static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) { - // blockIdx.z: idx of ne02*ne03 - // blockIdx.y: idx of ne01*scale_factor, aka ne1 - // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE - // ne00xne01: ne00 * ne01 - int ne0 = ne00 * scale_factor; - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - // operation - int i00 = nidx / scale_factor; - int i01 = blockIdx.y / scale_factor; - int offset_src = - i00 + - i01 * ne00 + - blockIdx.z * ne00xne01; - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - dst[offset_dst] = x[offset_src]; -} - -static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { - // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 - // blockIdx.y: idx of ne1 - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - - // operation - int offset_dst = - nidx + - blockIdx.y * ne0 + - blockIdx.z * ne0 * gridDim.y; - if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { - int offset_src = - nidx + - blockIdx.y * ne00 + - blockIdx.z * ne00 * ne01; - dst[offset_dst] = x[offset_src]; - } else { - dst[offset_dst] = 0.0f; - } -} - -static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { - // blockIDx.x: idx of ne0 / BLOCK_SIZE - int nidx = threadIdx.x + blockIdx.x * blockDim.x; - if (nidx >= ne0) { - return; - } - dst[nidx] = start + step * nidx; -} - -static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { - // blockIDx.y: idx of timesteps->ne[0] - // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE - int i = blockIdx.y; - int j = threadIdx.x + blockIdx.x * blockDim.x; - float * embed_data = (float *)((char *)dst + i*nb1); - - if (dim % 2 != 0 && j == ((dim + 1) / 2)) { - embed_data[dim] = 0.f; - } - - int half = dim / 2; - if (j >= half) { - return; - } - - float timestep = timesteps[i]; - float freq = (float)expf(-logf(max_period) * j / half); - float arg = timestep * freq; - embed_data[j] = cosf(arg); - embed_data[j + half] = sinf(arg); -} - -template -static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { - // blockIdx.x: num_groups idx - // threadIdx.x: block_size idx - int start = blockIdx.x * group_size; - int end = start + group_size; - - start += threadIdx.x; - - if (end >= ne_elements) { - end = ne_elements; - } - - float tmp = 0.0f; // partial sum for thread in warp - - for (int j = start; j < end; j += block_size) { - tmp += x[j]; - } - - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } - - float mean = tmp / group_size; - tmp = 0.0f; - - for (int j = start; j < end; j += block_size) { - float xi = x[j] - mean; - dst[j] = xi; - tmp += xi * xi; - } - - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } - - float variance = tmp / group_size; - float scale = rsqrtf(variance + eps); - for (int j = start; j < end; j += block_size) { - dst[j] *= scale; - } -} - -template -static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { - const int row = blockIdx.x*blockDim.y + threadIdx.y; - const int tid = threadIdx.x; - - float tmp = 0.0f; // partial sum for thread in warp - - for (int col = tid; col < ncols; col += block_size) { - const float xi = x[row*ncols + col]; - tmp += xi * xi; - } - - // sum up partial sums - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - __shared__ float s_sum[32]; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; - if (lane_id == 0) { - s_sum[warp_id] = tmp; - } - __syncthreads(); - tmp = s_sum[lane_id]; - tmp = warp_reduce_sum(tmp); - } - - const float mean = tmp / ncols; - const float scale = rsqrtf(mean + eps); - - for (int col = tid; col < ncols; col += block_size) { - dst[row*ncols + col] = scale * x[row*ncols + col]; - } -} - -static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q4_0 * x = (const block_q4_0 *) vx; - - const dfloat d = x[ib].d; - - const int vui = x[ib].qs[iqs]; - - v.x = vui & 0xF; - v.y = vui >> 4; - -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {8.0f, 8.0f}); - v = __hmul2(v, {d, d}); -#else - v.x = (v.x - 8.0f) * d; - v.y = (v.y - 8.0f) * d; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q4_1 * x = (const block_q4_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - const int vui = x[ib].qs[iqs]; - - v.x = vui & 0xF; - v.y = vui >> 4; - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q5_0 * x = (const block_q5_0 *) vx; - - const dfloat d = x[ib].d; - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_CUDA_F16 - v = __hsub2(v, {16.0f, 16.0f}); - v = __hmul2(v, {d, d}); -#else - v.x = (v.x - 16.0f) * d; - v.y = (v.y - 16.0f) * d; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q5_1 * x = (const block_q5_1 *) vx; - - const dfloat d = __low2half(x[ib].dm); - const dfloat m = __high2half(x[ib].dm); - - uint32_t qh; - memcpy(&qh, x[ib].qh, sizeof(qh)); - - const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; - const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; - - v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); - v.y = ((x[ib].qs[iqs] >> 4) | xh_1); - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); - v = __hadd2(v, {m, m}); -#else - v.x = (v.x * d) + m; - v.y = (v.y * d) + m; -#endif // GGML_CUDA_F16 -} - -static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const block_q8_0 * x = (const block_q8_0 *) vx; - - const dfloat d = x[ib].d; - - v.x = x[ib].qs[iqs + 0]; - v.y = x[ib].qs[iqs + 1]; - -#ifdef GGML_CUDA_F16 - v = __hmul2(v, {d, d}); -#else - v.x *= d; - v.y *= d; -#endif // GGML_CUDA_F16 -} - -template -static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { - - const int i = blockIdx.x; - - // assume 32 threads - const int tid = threadIdx.x; - const int il = tid/8; - const int ir = tid%8; - const int ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_0 * x = (const block_q4_0 *)vx + ib; - const float d = __half2float(x->d); - const float dm = -8*d; - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l+ 0] = d * (q[l] & 0xF) + dm; - y[l+16] = d * (q[l] >> 4) + dm; - } -} - -template -static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { - - const int i = blockIdx.x; - - // assume 32 threads - const int tid = threadIdx.x; - const int il = tid/8; - const int ir = tid%8; - const int ib = 8*i + ir; - if (ib >= nb32) { - return; - } - - dst_t * y = yy + 256*i + 32*ir + 4*il; - - const block_q4_1 * x = (const block_q4_1 *)vx + ib; - const float2 d = __half22float2(x->dm); - - const uint8_t * q = x->qs + 4*il; - - for (int l = 0; l < 4; ++l) { - y[l+ 0] = d.x * (q[l] & 0xF) + d.y; - y[l+16] = d.x * (q[l] >> 4) + d.y; - } -} - -//================================== k-quants - -template -static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_q2_K * x = (const block_q2_K *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int n = tid/32; - const int l = tid - 32*n; - const int is = 8*n + l/16; - - const uint8_t q = x[i].qs[32*n + l]; - dst_t * y = yy + i*QK_K + 128*n; - - float dall = __low2half(x[i].dm); - float dmin = __high2half(x[i].dm); - y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); - y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); - y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); -#else - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const uint8_t q = x[i].qs[il] >> (2*is); - dst_t * y = yy + i*QK_K + 16*is + il; - float dall = __low2half(x[i].dm); - float dmin = __high2half(x[i].dm); - y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); - y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); -#endif - -} - -template -static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_q3_K * x = (const block_q3_K *) vx; - -#if QK_K == 256 - const int r = threadIdx.x/4; - const int tid = r/2; - const int is0 = r%2; - const int l0 = 16*is0 + 4*(threadIdx.x%4); - const int n = tid / 4; - const int j = tid - 4*n; - - uint8_t m = 1 << (4*n + j); - int is = 8*n + 2*j + is0; - int shift = 2*j; - - int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : - is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : - is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : - (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); - float d_all = x[i].d; - float dl = d_all * (us - 32); - - dst_t * y = yy + i*QK_K + 128*n + 32*j; - const uint8_t * q = x[i].qs + 32*n; - const uint8_t * hm = x[i].hmask; - - for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); -#else - const int tid = threadIdx.x; - const int is = tid/16; // 0 or 1 - const int il = tid%16; // 0...15 - const int im = il/8; // 0...1 - const int in = il%8; // 0...7 - - dst_t * y = yy + i*QK_K + 16*is + il; - - const uint8_t q = x[i].qs[il] >> (2*is); - const uint8_t h = x[i].hmask[in] >> (2*is + im); - const float d = (float)x[i].d; - - if (is == 0) { - y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } else { - y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); - y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); - } -#endif - -} - -#if QK_K == 256 -static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { - if (j < 4) { - d = q[j] & 63; m = q[j + 4] & 63; - } else { - d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); - m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); - } -} -#endif - -template -static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q4_K * x = (const block_q4_K *) vx; - - const int i = blockIdx.x; - -#if QK_K == 256 - // assume 32 threads - const int tid = threadIdx.x; - const int il = tid/8; - const int ir = tid%8; - const int is = 2*il; - const int n = 4; - - dst_t * y = yy + i*QK_K + 64*il + n*ir; - - const float dall = __low2half(x[i].dm); - const float dmin = __high2half(x[i].dm); - - const uint8_t * q = x[i].qs + 32*il + n*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; - for (int l = 0; l < n; ++l) { - y[l + 0] = d1 * (q[l] & 0xF) - m1; - y[l +32] = d2 * (q[l] >> 4) - m2; - } -#else - const int tid = threadIdx.x; - const uint8_t * q = x[i].qs; - dst_t * y = yy + i*QK_K; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); - y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); -#endif -} - -template -static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q5_K * x = (const block_q5_K *) vx; - - const int i = blockIdx.x; - -#if QK_K == 256 - // assume 64 threads - this is very slightly better than the one below - const int tid = threadIdx.x; - const int il = tid/16; // il is in 0...3 - const int ir = tid%16; // ir is in 0...15 - const int is = 2*il; // is is in 0...6 - - dst_t * y = yy + i*QK_K + 64*il + 2*ir; - - const float dall = __low2half(x[i].dm); - const float dmin = __high2half(x[i].dm); - - const uint8_t * ql = x[i].qs + 32*il + 2*ir; - const uint8_t * qh = x[i].qh + 2*ir; - - uint8_t sc, m; - get_scale_min_k4(is + 0, x[i].scales, sc, m); - const float d1 = dall * sc; const float m1 = dmin * m; - get_scale_min_k4(is + 1, x[i].scales, sc, m); - const float d2 = dall * sc; const float m2 = dmin * m; - - uint8_t hm = 1 << (2*il); - y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; - y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; - hm <<= 1; - y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; - y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; -#else - const int tid = threadIdx.x; - const uint8_t q = x[i].qs[tid]; - const int im = tid/8; // 0...3 - const int in = tid%8; // 0...7 - const int is = tid/16; // 0 or 1 - const uint8_t h = x[i].qh[in] >> im; - const float d = x[i].d; - dst_t * y = yy + i*QK_K + tid; - y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); - y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); -#endif -} - -template -static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const block_q6_K * x = (const block_q6_K *) vx; - - const int i = blockIdx.x; -#if QK_K == 256 - - // assume 64 threads - this is very slightly better than the one below - const int tid = threadIdx.x; - const int ip = tid/32; // ip is 0 or 1 - const int il = tid - 32*ip; // 0...32 - const int is = 8*ip + il/16; - - dst_t * y = yy + i*QK_K + 128*ip + il; - - const float d = x[i].d; - - const uint8_t * ql = x[i].ql + 64*ip + il; - const uint8_t qh = x[i].qh[32*ip + il]; - const int8_t * sc = x[i].scales + is; - - y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); - y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); - y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); -#else - - // assume 32 threads - const int tid = threadIdx.x; - const int ip = tid/16; // 0 or 1 - const int il = tid - 16*ip; // 0...15 - - dst_t * y = yy + i*QK_K + 16*ip + il; - - const float d = x[i].d; - - const uint8_t ql = x[i].ql[16*ip + il]; - const uint8_t qh = x[i].qh[il] >> (2*ip); - const int8_t * sc = x[i].scales; - - y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); - y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); -#endif -} - -inline bool ggml_cuda_supports_mmq(enum ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - return true; - default: - return false; - } -} - -template -static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq2_xxs * x = (const block_iq2_xxs *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - -} - -template -static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq2_xs * x = (const block_iq2_xs *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint16_t * q2 = x[i].qs + 4*ib; - const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - -} - -template -static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq2_s * x = (const block_iq2_s *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; - const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; - for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); -#else - assert(false); -#endif - -} - -template -static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq3_xxs * x = (const block_iq3_xxs *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * q3 = x[i].qs + 8*ib; - const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); - const uint32_t aux32 = gas[0] | (gas[1] << 16); - const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; - const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -#else - assert(false); -#endif - -} - -template -static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq3_s * x = (const block_iq3_s *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); - const uint8_t signs = x[i].signs[4*ib + il]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); - } -#else - assert(false); -#endif - -} - -template -static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq1_s * x = (const block_iq1_s *) vx; - - const int tid = threadIdx.x; -#if QK_K == 256 - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; - const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); - uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; - grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; - grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; - grid32[0] &= 0x0f0f0f0f; - for (int j = 0; j < 8; ++j) { - y[j] = d * (q[j] + delta); - } -#else - assert(false); -#endif - -} - -static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; - -template -static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { - - const int i = blockIdx.x; - const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); - - const int tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[ib].qs + 4*il; - const float d = (float)x[ib].d; - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } - -} - -#if QK_K != 64 -template -static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { - const int i = blockIdx.x; - const block_iq4_xs * x = (const block_iq4_xs *)vx; - - const int tid = threadIdx.x; - const int il = tid/8; // 0...3 - const int ib = tid%8; // 0...7 - dst_t * y = yy + i*QK_K + 32*ib + 4*il; - const uint8_t * q4 = x[i].qs + 16*ib + 4*il; - const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); - for (int j = 0; j < 4; ++j) { - y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; - y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; - } -} -#endif - -static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q2_K * x = (const block_q2_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 16/K_QUANTS_PER_ITERATION; - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int s_offset = 8*im; - const int y_offset = 128*im + l0; - - uint32_t aux[4]; - const uint8_t * d = (const uint8_t *)aux; - const uint8_t * m = (const uint8_t *)(aux + 2); - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - - const float dall = __low2half(x[i].dm); - const float dmin = __high2half(x[i].dm); - - const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); - aux[0] = a[0] & 0x0f0f0f0f; - aux[1] = a[1] & 0x0f0f0f0f; - aux[2] = (a[0] >> 4) & 0x0f0f0f0f; - aux[3] = (a[1] >> 4) & 0x0f0f0f0f; - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) - + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) - + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) - + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) - + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) - + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) - + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) - +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); - sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] - + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; - - } - tmp += dall * sum1 - dmin * sum2; - - } -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; - - uint32_t uaux[2]; - const uint8_t * d = (const uint8_t *)uaux; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint32_t * s = (const uint32_t *)x[i].scales; - - uaux[0] = s[0] & 0x0f0f0f0f; - uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; - - const float2 dall = __half22float2(x[i].dm); - - float sum1 = 0, sum2 = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t ql = q[l]; - sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) - + y[l+16] * d[1] * ((ql >> 2) & 3) - + y[l+32] * d[2] * ((ql >> 4) & 3) - + y[l+48] * d[3] * ((ql >> 6) & 3); - sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; - } - tmp += dall.x * sum1 - dall.y * sum2; - } -#endif - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (threadIdx.x == 0) { - dst[row] = tmp; - } -} - -static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q3_K * x = (const block_q3_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - - const uint16_t kmask1 = 0x0303; - const uint16_t kmask2 = 0x0f0f; - - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop - const int step = 16/K_QUANTS_PER_ITERATION; - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0....15 or 0...7 - - const uint8_t m = 1 << (4*im); - - const int l0 = n*in; // 0...15 or 0...14 in steps of 2 - const int q_offset = 32*im + l0; - const int y_offset = 128*im + l0; - - uint16_t utmp[4]; - const int8_t * s = (const int8_t *)utmp; - - const uint16_t s_shift = 4*im; - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * q = x[i].qs + q_offset; - const uint8_t * h = x[i].hmask + l0; - - const uint16_t * a = (const uint16_t *)x[i].scales; - utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); - utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); - utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); - utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); - - const float d = x[i].d; - - float sum = 0; - for (int l = 0; l < n; ++l) { - sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) - + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) - + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) - + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); - sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) - + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) - + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) - + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); - } - tmp += d * sum; - - } -#else - - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 - const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 - const int in = offset/8; // 0 or 1 - const int im = offset%8; // 0...7 - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + offset; - const uint8_t * q = x[i].qs + offset; - const uint8_t * s = x[i].scales; - - const float dall = (float)x[i].d; - - float sum = 0; - for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - const uint8_t hl = x[i].hmask[im+l] >> in; - const uint8_t ql = q[l]; - sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) - + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) - + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) - + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); - } - tmp += sum; - } -#endif - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (threadIdx.x == 0) { - dst[row] = tmp; - } -} - -static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q4_K * x = (const block_q4_K *)vx + ib0; - -#if QK_K == 256 - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 - - const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 - - const int il = tid/step; // 0...3 - const int ir = tid - step*il; // 0...7 or 0...3 - const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - -#if K_QUANTS_PER_ITERATION == 2 - uint32_t q32[4]; - const uint8_t * q4 = (const uint8_t *)q32; -#else - uint16_t q16[4]; - const uint8_t * q4 = (const uint8_t *)q16; -#endif - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = __low2half(x[i].dm); - const float dmin = __high2half(x[i].dm); - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - -#if K_QUANTS_PER_ITERATION == 2 - const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); - const uint32_t * q2 = q1 + 16; - - q32[0] = q1[0] & 0x0f0f0f0f; - q32[1] = q1[0] & 0xf0f0f0f0; - q32[2] = q2[0] & 0x0f0f0f0f; - q32[3] = q2[0] & 0xf0f0f0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 4; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; - s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#else - const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); - const uint16_t * q2 = q1 + 32; - - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[0] & 0xf0f0; - q16[2] = q2[0] & 0x0f0f; - q16[3] = q2[0] & 0xf0f0; - - float4 s = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - for (int l = 0; l < 2; ++l) { - s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; - s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; - smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; - } - tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; -#endif - - } -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); - - const int step = tid * K_QUANTS_PER_ITERATION; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - float tmp = 0; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const float * y = yy + i*QK_K + step; - const uint16_t * a = (const uint16_t *)x[i].scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - const float d = (float)x[i].dm[0]; - const float m = (float)x[i].dm[1]; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) - + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) - + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) - + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); - } - tmp += sum; - } - -#endif - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (tid == 0) { - dst[row] = tmp; - } -} - -static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { - - const int row = blockIdx.x; - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q5_K * x = (const block_q5_K *)vx + ib0; - - float tmp = 0; // partial sum for thread in warp - -#if QK_K == 256 - const uint16_t kmask1 = 0x3f3f; - const uint16_t kmask2 = 0x0f0f; - const uint16_t kmask3 = 0xc0c0; - - const int tid = threadIdx.x/2; // 0...15 - const int ix = threadIdx.x%2; - - const int il = tid/4; // 0...3 - const int ir = tid - 4*il;// 0...3 - const int n = 2; - - const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 - const int in = il%2; - - const int l0 = n*(2*ir + in); - const int q_offset = 32*im + l0; - const int y_offset = 64*im + l0; - - const uint8_t hm1 = 1 << (2*im); - const uint8_t hm2 = hm1 << 4; - - uint16_t aux[4]; - const uint8_t * sc = (const uint8_t *)aux; - - uint16_t q16[8]; - const uint8_t * q4 = (const uint8_t *)q16; - - for (int i = ix; i < num_blocks_per_row; i += 2) { - - const uint8_t * ql1 = x[i].qs + q_offset; - const uint8_t * qh = x[i].qh + l0; - const float * y1 = yy + i*QK_K + y_offset; - const float * y2 = y1 + 128; - - const float dall = __low2half(x[i].dm); - const float dmin = __high2half(x[i].dm); - - const uint16_t * a = (const uint16_t *)x[i].scales; - aux[0] = a[im+0] & kmask1; - aux[1] = a[im+2] & kmask1; - aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); - aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); - - float4 sum = {0.f, 0.f, 0.f, 0.f}; - float smin = 0; - const uint16_t * q1 = (const uint16_t *)ql1; - const uint16_t * q2 = q1 + 32; - q16[0] = q1[0] & 0x0f0f; - q16[1] = q1[8] & 0x0f0f; - q16[2] = (q1[0] >> 4) & 0x0f0f; - q16[3] = (q1[8] >> 4) & 0x0f0f; - q16[4] = q2[0] & 0x0f0f; - q16[5] = q2[8] & 0x0f0f; - q16[6] = (q2[0] >> 4) & 0x0f0f; - q16[7] = (q2[8] >> 4) & 0x0f0f; - for (int l = 0; l < n; ++l) { - sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) - + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); - sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) - + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); - sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) - + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); - sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) - + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); - smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] - + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; - } - tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; - } - -#else - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); - const int step = tid * K_QUANTS_PER_ITERATION; - const int im = step/8; - const int in = step%8; - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - const uint8_t * q = x[i].qs + step; - const int8_t * s = x[i].scales; - const float * y = yy + i*QK_K + step; - const float d = x[i].d; - float sum = 0.f; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - const uint8_t h = x[i].qh[in+j] >> im; - sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) - + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) - + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) - + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); - } - tmp += sum; - } -#endif - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (threadIdx.x == 0) { - dst[row] = tmp; - } -} - -static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { - - static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); - - const int row = blockIdx.x*blockDim.y + threadIdx.y; - if (row > nrows) return; - - const int num_blocks_per_row = ncols / QK_K; - const int ib0 = row*num_blocks_per_row; - - const block_q6_K * x = (const block_q6_K *)vx + ib0; - -#if QK_K == 256 - - const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 - const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 - - const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 - - const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... - const int in = tid - step*im; // 0...15 or 0...7 - -#if K_QUANTS_PER_ITERATION == 1 - const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 - const int is = 0; -#else - const int l0 = 4 * in; // 0, 4, 8, ..., 28 - const int is = in / 4; -#endif - const int ql_offset = 64*im + l0; - const int qh_offset = 32*im + l0; - const int s_offset = 8*im + is; - const int y_offset = 128*im + l0; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + y_offset; - const uint8_t * ql = x[i].ql + ql_offset; - const uint8_t * qh = x[i].qh + qh_offset; - const int8_t * s = x[i].scales + s_offset; - - const float d = x[i].d; - -#if K_QUANTS_PER_ITERATION == 1 - float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) - + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) - + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) - + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) - + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) - + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) - + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) - +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); - tmp += sum; -#else - float sum = 0; - for (int l = 0; l < 4; ++l) { - sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) - + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) - + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) - + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); - } - tmp += sum; -#endif - - } - -#else - - const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 - const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 - - const int step = tid * K_QUANTS_PER_ITERATION; - - float tmp = 0; // partial sum for thread in warp - - for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { - - const float * y = yy + i * QK_K + step; - const uint8_t * ql = x[i].ql + step; - const uint8_t * qh = x[i].qh + step; - const int8_t * s = x[i].scales; - - const float d = x[i+0].d; - - float sum = 0; - for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { - sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) - + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) - + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) - + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); - } - tmp += sum; - - } - -#endif - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (tid == 0) { - dst[row] = tmp; - } -} - -static __device__ void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){ - const half * x = (const half *) vx; - - // automatic half -> float type cast if dfloat == float - v.x = x[ib + iqs + 0]; - v.y = x[ib + iqs + 1]; -} - -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { - const int ix = blockDim.x*blockIdx.x + threadIdx.x; - - if (ix >= kx_padded) { - return; - } - - const int iy = blockDim.y*blockIdx.y + threadIdx.y; - - const int i_padded = iy*kx_padded + ix; - - block_q8_1 * y = (block_q8_1 *) vy; - - const int ib = i_padded / QK8_1; // block index - const int iqs = i_padded % QK8_1; // quant index - - const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; - float amax = fabsf(xi); - float sum = xi; - - amax = warp_reduce_max(amax); - sum = warp_reduce_sum(sum); - - const float d = amax / 127; - const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); - - y[ib].qs[iqs] = q; - - if (iqs > 0) { - return; - } - - reinterpret_cast(y[ib].ds.x) = d; - reinterpret_cast(y[ib].ds.y) = sum; -} - -template -static __global__ void k_get_rows( - const void * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { - - const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; - - const int ib = i00/qk; // block index - const int iqs = (i00%qk)/qr; // quant index - const int iybs = i00 - i00%qk; // dst block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(src0_row, ib, iqs, v); - - dst_row[iybs + iqs + 0] = v.x; - dst_row[iybs + iqs + y_offset] = v.y; -} - -template -static __global__ void k_get_rows_float( - const src0_t * src0, const int32_t * src1, dst_t * dst, - int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ - /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ - /*size_t s0,*/ size_t s1, size_t s2, size_t s3, - /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, - size_t s10, size_t s11, size_t s12/*, size_t s13*/) { - - const int i00 = blockIdx.x*blockDim.x + threadIdx.x; - const int i10 = blockDim.y*blockIdx.y + threadIdx.y; - const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; - const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; - - if (i00 >= ne00) { - return; - } - - const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; - - dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; - const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); - - dst_row[i00] = src0_row[i00]; -} - -template -static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { - const int i = 2*(blockDim.x*blockIdx.x + threadIdx.x); - - if (i >= k) { - return; - } - - const int ib = i/qk; // block index - const int iqs = (i%qk)/qr; // quant index - const int iybs = i - i%qk; // y block start index - const int y_offset = qr == 1 ? 1 : qk/2; - - // dequantize - dfloat2 v; - dequantize_kernel(vx, ib, iqs, v); - - y[iybs + iqs + 0] = v.x; - y[iybs + iqs + y_offset] = v.y; -} - -template -static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - - const src_t * x = (src_t *) vx; - - y[i] = x[i]; -} - -template -static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int k) { -#if __CUDA_ARCH__ >= CC_PASCAL - constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; - - const int i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; - const int * x0 = ((int *) vx) + blockIdx.x * nint; - half2 * y2 = (half2 *) (y + i0); - - __shared__ int vals[nint]; - -#pragma unroll - for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) { - if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) { - break; - } - - const int ix = ix0 + threadIdx.x; - vals[ix] = x0[ix]; - } - -#pragma unroll - for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) { - if (need_check && i0 + iy + 2*threadIdx.x >= k) { - return; - } - - const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0); - const half d = *b0; - const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)]; - - y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d)); - } -#else - GGML_UNUSED(vx); - GGML_UNUSED(y); - GGML_UNUSED(k); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_PASCAL -} - -// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called -// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q - -#define VDR_Q4_0_Q8_1_MMVQ 2 -#define VDR_Q4_0_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( - const int * v, const int * u, const float & d4, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); - } - - const float2 ds8f = __half22float2(ds8); - - // second part effectively subtracts 8 from each quant value - return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q4_1_Q8_1_MMVQ 2 -#define VDR_Q4_1_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( - const int * v, const int * u, const half2 & dm4, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; - const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; - - // SIMD dot product of quantized values - sumi = __dp4a(vi0, u[2*i+0], sumi); - sumi = __dp4a(vi1, u[2*i+1], sumi); - } - -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm4, ds8)); - const float d4d8 = tmp.x; - const float m4s8 = tmp.y; -#else - const float2 dm4f = __half22float2(dm4); - const float2 ds8f = __half22float2(ds8); - const float d4d8 = dm4f.x * ds8f.x; - const float m4s8 = dm4f.y * ds8f.y; -#endif // GGML_CUDA_F16 - - // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it - return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q5_0_Q8_1_MMVQ 2 -#define VDR_Q5_0_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( - const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values - } - - const float2 ds8f = __half22float2(ds8); - - // second part effectively subtracts 16 from each quant value - return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q5_1_Q8_1_MMVQ 2 -#define VDR_Q5_1_Q8_1_MMQ 4 - -template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( - const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits - vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 - vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 - vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 - vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 - sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values - - int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits - vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 - vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 - vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 - vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 - sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values - } - -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm5, ds8)); - const float d5d8 = tmp.x; - const float m5s8 = tmp.y; -#else - const float2 dm5f = __half22float2(dm5); - const float2 ds8f = __half22float2(ds8); - const float d5d8 = dm5f.x * ds8f.x; - const float m5s8 = dm5f.y * ds8f.y; -#endif // GGML_CUDA_F16 - - // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it - return sumi*d5d8 + m5s8 / (QI5_1 / vdr); - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q8_0_Q8_1_MMVQ 2 -#define VDR_Q8_0_Q8_1_MMQ 8 - -template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( - const int * v, const int * u, const float & d8_0, const float & d8_1) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); - } - - return d8_0*d8_1 * sumi; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( - const int * v, const int * u, const half2 & dm8, const half2 & ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i = 0; i < vdr; ++i) { - // SIMD dot product of quantized values - sumi = __dp4a(v[i], u[i], sumi); - } - -#ifdef GGML_CUDA_F16 - const float2 tmp = __half22float2(__hmul2(dm8, ds8)); - const float d8d8 = tmp.x; - const float m8s8 = tmp.y; -#else - const float2 dm8f = __half22float2(dm8); - const float2 ds8f = __half22float2(ds8); - const float d8d8 = dm8f.x * ds8f.x; - const float m8s8 = dm8f.y * ds8f.y; -#endif // GGML_CUDA_F16 - - // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it - return sumi*d8d8 + m8s8 / (QI8_1 / vdr); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q2_K_Q8_1_MMVQ 1 -#define VDR_Q2_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( - const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const half2 & dm2, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR2_K; ++i) { - const int sc = scales[2*i]; - - const int vi = (v >> (2*i)) & 0x03030303; - - sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values - } - - const float2 dm2f = __half22float2(dm2); - - return dm2f.x*sumf_d - dm2f.y*sumf_m; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -// contiguous u/y values -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const half2 & dm2, const float & d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi_d = 0; - int sumi_m = 0; - -#pragma unroll - for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { - int sumi_d_sc = 0; - - const int sc = scales[i0 / (QI8_1/2)]; - - // fill int with 4x m - int m = sc >> 4; - m |= m << 8; - m |= m << 16; - -#pragma unroll - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product - sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m - } - - sumi_d += sumi_d_sc * (sc & 0xF); - } - - const float2 dm2f = __half22float2(dm2); - - return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q3_K_Q8_1_MMVQ 1 -#define VDR_Q3_K_Q8_1_MMQ 2 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( - const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, - const int & scale_offset, const float & d3, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - const int isc = scale_offset + 2*i; - - const int isc_low = isc % (QK_K/32); - const int sc_shift_low = 4 * (isc / (QK_K/32)); - const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; - - const int isc_high = isc % (QK_K/64); - const int sc_shift_high = 2 * (isc / (QK_K/64)); - const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; - - const int sc = (sc_low | sc_high) - 32; - - const int vil = (vl >> (2*i)) & 0x03030303; - - const int vih = ((vh >> i) << 2) & 0x04040404; - - const int vi = __vsubss4(vil, vih); - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d3 * sumf; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -// contiguous u/y values -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, - const float & d3, const float & d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - int sumi = 0; - -#pragma unroll - for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { - int sumi_sc = 0; - - for (int i = i0; i < i0 + QI8_1/2; ++i) { - sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product - } - - sumi += sumi_sc * scales[i0 / (QI8_1/2)]; - } - - return d3*d8 * sumi; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q4_K_Q8_1_MMVQ 2 -#define VDR_Q4_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K; ++i) { - const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; - const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; - - const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values - } - - const float2 dm4f = __half22float2(dm4); - - return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -// contiguous u/y values -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product - } - - const float2 ds8f = __half22float2(ds8[i]); - - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val - } - - const float2 dm4f = __half22float2(dm4); - - return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q5_K_Q8_1_MMVQ 2 -#define VDR_Q5_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( - const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K; ++i) { - const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; - const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; - - const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; - const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; - - const int v0i = vl0i | vh0i; - const int v1i = vl1i | vh1i; - - const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product - const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u - - sumf_d += d8[i] * (dot1 * sc[i]); - sumf_m += d8[i] * (dot2 * m[i]); - - } - - const float2 dm5f = __half22float2(dm5); - - return dm5f.x*sumf_d - dm5f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -// contiguous u/y values -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, - const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - float sumf_m = 0.0f; - -#pragma unroll - for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { - int sumi_d = 0; - -#pragma unroll - for (int j = 0; j < QI8_1; ++j) { - sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product - } - - const float2 ds8f = __half22float2(ds8[i]); - - sumf_d += ds8f.x * (sc[i] * sumi_d); - sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val - } - - const float2 dm4f = __half22float2(dm4); - - return dm4f.x*sumf_d - dm4f.y*sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -#define VDR_Q6_K_Q8_1_MMVQ 1 -#define VDR_Q6_K_Q8_1_MMQ 8 - -// contiguous v/x values -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( - const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, - const float & d, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf = 0.0f; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - const int sc = scales[4*i]; - - const int vil = (vl >> (4*i)) & 0x0F0F0F0F; - - const int vih = ((vh >> (4*i)) << 4) & 0x30303030; - - const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 - - sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product - } - - return d*sumf; -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -// contiguous u/y values -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( - const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, - const float & d6, const float * __restrict__ d8) { - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - float sumf_d = 0.0f; - -#pragma unroll - for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { - int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale - -#pragma unroll - for (int i = i0; i < i0 + 2; ++i) { - sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product - sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product - - sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product - sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product - } - - sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); - } - - return d6 * sumf_d; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A -} - -static __device__ __forceinline__ float vec_dot_q4_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; - - int v[VDR_Q4_0_Q8_1_MMVQ]; - int u[2*VDR_Q4_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); - } - - return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; - - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q4_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_0; - const int kqsx = k % QI4_0; - - const block_q4_0 * bx0 = (const block_q4_0 *) vx; - - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { - int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const float * x_dmf = (const float *) x_dm; - - int u[2*VDR_Q4_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; - } - - return vec_dot_q4_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; - - int v[VDR_Q4_1_Q8_1_MMVQ]; - int u[2*VDR_Q4_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); - } - - return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; - - *x_ql = tile_x_qs; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q4_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_1; - const int kqsx = k % QI4_1; - - const block_q4_1 * bx0 = (const block_q4_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { - int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - - int u[2*VDR_Q4_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; - } - - return vec_dot_q4_1_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], - y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q5_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; - - int vl[VDR_Q5_0_Q8_1_MMVQ]; - int vh[VDR_Q5_0_Q8_1_MMVQ]; - int u[2*VDR_Q5_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); - vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); - } - - return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; - - *x_ql = tile_x_ql; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q5_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_0; - const int kqsx = k % QI5_0; - - const block_q5_0 * bx0 = (const block_q5_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8(bxi->qs, kqsx); - const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { - int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - int u[2*VDR_Q5_0_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; - } - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; - - int vl[VDR_Q5_1_Q8_1_MMVQ]; - int vh[VDR_Q5_1_Q8_1_MMVQ]; - int u[2*VDR_Q5_1_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { - vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); - vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); - u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); - } - - return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); -} - -template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; -} - -template static __device__ __forceinline__ void load_tiles_q5_1( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_1; - const int kqsx = k % QI5_1; - - const block_q5_1 * bx0 = (const block_q5_1 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); - - int qs0 = (ql >> 0) & 0x0F0F0F0F; - qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 - qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 - qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 - qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; - - int qs1 = (ql >> 4) & 0x0F0F0F0F; - qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 - qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 - qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 - qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 - - x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { - int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; - } -} - -static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); - const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; - - int u[2*VDR_Q5_1_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { - u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; - u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; - } - - return vec_dot_q8_1_q8_1_impl - (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); -} - -static __device__ __forceinline__ float vec_dot_q8_0_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; - - int v[VDR_Q8_0_Q8_1_MMVQ]; - int u[VDR_Q8_0_Q8_1_MMVQ]; - -#pragma unroll - for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { - v[i] = get_int_from_int8(bq8_0->qs, iqs + i); - u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); - } - - return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); -} - -template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; - - *x_ql = tile_x_qs; - *x_dm = (half2 *) tile_x_d; -} - -template static __device__ __forceinline__ void load_tiles_q8_0( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI8_0; - const int kqsx = k % QI8_0; - float * x_dmf = (float *) x_dm; - - const block_q8_0 * bx0 = (const block_q8_0 *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { - int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; - - if (need_check) { - i = min(i, i_max); - } - - const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; - } -} - -static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - return vec_dot_q8_0_q8_1_impl - (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], - y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q2_K * bq2_K = (const block_q2_K *) vbq; - - const int bq8_offset = QR2_K * (iqs / QI8_1); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const uint8_t * scales = bq2_K->scales + scale_offset; - - const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); - int u[QR2_K]; - float d8[QR2_K]; - -#pragma unroll - for (int i = 0; i < QR2_K; ++ i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); - } - - return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q2_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI2_K; - const int kqsx = k % QI2_K; - - const block_q2_K * bx0 = (const block_q2_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; - const int kbxd = k % blocks_per_tile_x_row; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { - int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = min(i, i_max); - } - - const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); - } -} - -static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const int kbx = k / QI2_K; - const int ky = (k % QI2_K) * QR2_K; - const float * y_df = (const float *) y_ds; - - int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; - - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); - const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); - -#pragma unroll - for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { - v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; - } - - const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; - - const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; - return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q3_K * bq3_K = (const block_q3_K *) vbq; - - const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); - const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); - - const float d = bq3_K->d; - - const int vl = get_int_from_uint8(bq3_K->qs, iqs); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; - - int u[QR3_K]; - float d8[QR3_K]; - -#pragma unroll - for (int i = 0; i < QR3_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + i].ds); - } - - return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; - __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_qh = tile_x_qh; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q3_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI3_K; - const int kqsx = k % QI3_K; - - const block_q3_K * bx0 = (const block_q3_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; - const int kbxd = k % blocks_per_tile_x_row; - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { - int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { - int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); - - // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted - x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { - int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); - - if (need_check) { - i = min(i, i_max); - } - - const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); - - const int ksc = k % (QI3_K/4); - - const int ksc_low = ksc % (QI3_K/8); - const int shift_low = 4 * (ksc / (QI3_K/8)); - const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; - - const int ksc_high = QI3_K/8; - const int shift_high = 2 * ksc; - const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; - - const int sc = __vsubss4(sc_low | sc_high, 0x20202020); - - x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; - } -} - -static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - - const int kbx = k / QI3_K; - const int ky = (k % QI3_K) * QR3_K; - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; - - int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; - -#pragma unroll - for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { - const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); - const int shift = 2 * ((ky % 32) / 8); - const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; - - const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); - const int vlh = (vh << 2) & 0x04040404; - - v[l] = __vsubss4(vll, vlh); - } - - const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; - return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - -#ifndef GGML_QKK_64 - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - int v[2]; - int u[2*QR4_K]; - float d8[QR4_K]; - - // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 - const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); - - // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 - // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 - // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 - // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 - - const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - v[0] = q4[0]; - v[1] = q4[4]; - - const uint16_t * scales = (const uint16_t *)bq4_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - - for (int i = 0; i < QR4_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); - -#else - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q4_K * bq4_K = (const block_q4_K *) vbq; - - float sumf_d = 0.0f; - float sumf_m = 0.0f; - - uint16_t aux16[2]; - const uint8_t * s = (const uint8_t *)aux16; - - const uint16_t * a = (const uint16_t *)bq4_K->scales; - aux16[0] = a[0] & 0x0f0f; - aux16[1] = (a[0] >> 4) & 0x0f0f; - - const float dall = bq4_K->dm[0]; - const float dmin = bq4_K->dm[1]; - - const float d8_1 = __low2float(bq8_1[0].ds); - const float d8_2 = __low2float(bq8_1[1].ds); - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * q4 = (const int *)bq4_K->qs + (iqs/2); - const int v1 = q4[0]; - const int v2 = q4[4]; - - const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); - const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); - const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); - const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); - - sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); - sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); - - return dall * sumf_d - dmin * sumf_m; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A - -#endif -} - -template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q4_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI4_K; // == 0 if QK_K == 256 - const int kqsx = k % QI4_K; // == k if QK_K == 256 - - const block_q4_K * bx0 = (const block_q4_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; - - x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { - int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; - -#if QK_K == 256 - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; -#else - x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; -#endif - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); - - const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; - return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - -#ifndef GGML_QKK_64 - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - int vl[2]; - int vh[2]; - int u[2*QR5_K]; - float d8[QR5_K]; - - const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); - const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); - const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); - - vl[0] = ql[0]; - vl[1] = ql[4]; - - vh[0] = qh[0] >> bq8_offset; - vh[1] = qh[4] >> bq8_offset; - - const uint16_t * scales = (const uint16_t *)bq5_K->scales; - uint16_t aux[2]; - const int j = bq8_offset/2; - if (j < 2) { - aux[0] = scales[j+0] & 0x3f3f; - aux[1] = scales[j+2] & 0x3f3f; - } else { - aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); - aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); - } - const uint8_t * sc = (const uint8_t *)aux; - const uint8_t * m = sc + 2; - -#pragma unroll - for (int i = 0; i < QR5_K; ++i) { - const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; - d8[i] = __low2float(bq8i->ds); - - const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); - u[2*i+0] = q8[0]; - u[2*i+1] = q8[4]; - } - - return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); - -#else - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const block_q5_K * bq5_K = (const block_q5_K *) vbq; - - const int8_t * s = bq5_K->scales; - - const float d = bq5_K->d; - - const float d8_1 = __low2half(bq8_1[0].ds); - const float d8_2 = __low2half(bq8_1[1].ds); - - const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); - const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); - const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); - const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); - - const int * ql = (const int *)bq5_K->qs + (iqs/2); - const int vl1 = ql[0]; - const int vl2 = ql[4]; - - const int step = 4 * (iqs/2); // 0, 4, 8, 12 - const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 - const int in = step%8; // 0, 4, 0, 4 - const int vh = (*((const int *)(bq5_K->qh + in))) >> im; - - const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); - const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); - const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); - const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); - - const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) - + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); - - return d * sumf_d; - -#else - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= MIN_CC_DP4A - -#endif -} - -template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q5_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI5_K; // == 0 if QK_K == 256 - const int kqsx = k % QI5_K; // == k if QK_K == 256 - - const block_q5_K * bx0 = (const block_q5_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR5_K*kqsx; - - const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); - const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; - const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; - - const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; - const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); - - x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; - x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { - int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; - -#if QK_K == 256 - x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; -#endif - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); - - const int * scales = (const int *) bxi->scales; - - const int ksc = k % (WARP_SIZE/8); - - // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 - int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits - scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits - - x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; - } -} - -static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); - - const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; - const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; - return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, - x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_q6_K * bq6_K = (const block_q6_K *) vbq; - - const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); - const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); - const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); - - const int vl = get_int_from_uint8(bq6_K->ql, iqs); - const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; - - const int8_t * scales = bq6_K->scales + scale_offset; - - int u[QR6_K]; - float d8[QR6_K]; - -#pragma unroll - for (int i = 0; i < QR6_K; ++i) { - u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); - d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); - } - - return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); -} - -template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - GGML_UNUSED(x_qh); - - __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; - __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; - __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; - - *x_ql = tile_x_ql; - *x_dm = tile_x_dm; - *x_sc = tile_x_sc; -} - -template static __device__ __forceinline__ void load_tiles_q6_K( - const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, - int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - GGML_UNUSED(x_qh); - - GGML_CUDA_ASSUME(i_offset >= 0); - GGML_CUDA_ASSUME(i_offset < nwarps); - GGML_CUDA_ASSUME(k >= 0); - GGML_CUDA_ASSUME(k < WARP_SIZE); - - const int kbx = k / QI6_K; // == 0 if QK_K == 256 - const int kqsx = k % QI6_K; // == k if QK_K == 256 - - const block_q6_K * bx0 = (const block_q6_K *) vx; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { - int i = i0 + i_offset; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; - const int ky = QR6_K*kqsx; - - const int ql = get_int_from_uint8(bxi->ql, kqsx); - const int ql0 = (ql >> 0) & 0x0F0F0F0F; - const int ql1 = (ql >> 4) & 0x0F0F0F0F; - - const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); - const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; - const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; - - const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; - const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); - - x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); - x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); - } - - const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 - const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 - float * x_dmf = (float *) x_dm; - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { - int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; - - x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; - } - -#pragma unroll - for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { - int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; - - if (need_check) { - i = min(i, i_max); - } - - const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; - - x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); - } -} - -static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( - const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, - const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - GGML_UNUSED(x_qh); - - const float * x_dmf = (const float *) x_dm; - const float * y_df = (const float *) y_ds; - - const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); - - const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; - const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; - return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); -} - -static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 - const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; - -#if QR2_XXS == 8 - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = q2[2] | (q2[3] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); - const uint8_t signs = ksigns_iq2xs[aux32 & 127]; - for (int j = 0; j < 8; ++j) { - sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); - } - q8 += 8; - aux32 >>= 7; - } - const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * sumi; -#else - // iqs is 0...15 - const int ib32 = iqs/2; - const int il = iqs%2; - const uint16_t * q2 = bq2->qs + 4*ib32; - const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); - const uint32_t aux32 = q2[2] | (q2[3] << 16); - const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f; - const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; - const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; - const int8_t * q8 = bq8_1[ib32].qs + 16*il; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 8; ++j) { - sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); - sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1); - } - return d * (sumi1 + sumi2); -#endif -#else - assert(false); - return 0.f; -#endif -} - -static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 - const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; - - const int ib32 = iqs; - const uint16_t * q2 = bq2->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); - sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); - const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); - sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - GGML_UNUSED(ksigns64); - assert(false); - return 0.f; -#endif -#else - GGML_UNUSED(ksigns64); - assert(false); - return 0.f; -#endif -} - -// TODO -static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 - const block_iq2_s * bq2 = (const block_iq2_s *) vbq; - - const int ib32 = iqs; - const int8_t * q8 = bq8_1[ib32].qs; - const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; - const uint8_t ls1 = bq2->scales[ib32] & 0xf; - const uint8_t ls2 = bq2->scales[ib32] >> 4; - int sumi1 = 0; - for (int l = 0; l < 2; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ signs1, signs1); - sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); - sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); - q8 += 8; - } - int sumi2 = 0; - for (int l = 2; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); - const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid[1] ^ signs1, signs1); - sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); - sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); - q8 += 8; - } - const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; - return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); -#else - GGML_UNUSED(ksigns64); - assert(false); - return 0.f; -#endif -#else - GGML_UNUSED(ksigns64); - assert(false); - return 0.f; -#endif -} - -static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 - const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; - - const int ib32 = iqs; - const uint8_t * q3 = bq2->qs + 8*ib32; - const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - uint32_t aux32 = gas[0] | (gas[1] << 16); - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; - const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; - const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); - const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]); - const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; - aux32 >>= 7; - } - const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; - return d * sumi; -#else - assert(false); - return 0.f; -#endif -#else - assert(false); - return 0.f; -#endif -} - -// TODO: don't use lookup table for signs -static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -#if QK_K == 256 - const block_iq3_s * bq2 = (const block_iq3_s *) vbq; - - const int ib32 = iqs; - const uint8_t * qs = bq2->qs + 8*ib32; - const int8_t * q8 = bq8_1[ib32].qs; - int sumi = 0; - for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); - uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); - uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); - const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); - const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); - sumi = __dp4a(grid_l, *((int *)q8+0), sumi); - sumi = __dp4a(grid_h, *((int *)q8+1), sumi); - q8 += 8; - } - const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); - return d * sumi; -#else - assert(false); - return 0.f; -#endif -#else - assert(false); - return 0.f; -#endif -} - -static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { -#if QK_K == 256 - const block_iq1_s * bq1 = (const block_iq1_s *) vbq; - - const int ib32 = iqs; - int sumi = 0; -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const int * q8 = (const int *)bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); - int grid0 = grid[0] & 0x0f0f0f0f; - int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; - sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi)); - } -#else - const int8_t * q8 = bq8_1[ib32].qs; - for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); - for (int j = 0; j < 4; ++j) { - sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); - } - q8 += 8; - } -#endif - const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; - const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); - const float d = d1q * __low2float (bq8_1[ib32].ds); - const float m = d1q * __high2float(bq8_1[ib32].ds); - return d * sumi + m * delta; -#else - assert(false); - return 0.f; -#endif -} - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics -static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, - int & val1, int & val2) { - - uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; - aux32 = q4 & 0x0f0f0f0f; - uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); - uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); - val1 = v1 | (v2 << 16); - aux32 = (q4 >> 4) & 0x0f0f0f0f; - v1 = values[q8[0]] | (values[q8[1]] << 8); - v2 = values[q8[2]] | (values[q8[3]] << 8); - val2 = v1 | (v2 << 16); -} -#endif - -static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - - const block_iq4_nl * bq = (const block_iq4_nl *) vbq; - -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; - const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; - - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { - const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); - get_int_from_table_16(aux, values, v1, v2); - sumi1 = __dp4a(v1, q8[l+0], sumi1); - sumi2 = __dp4a(v2, q8[l+4], sumi2); - } - -#else - const uint8_t * q4 = bq->qs + 4*iqs; - const int8_t * q8 = bq8_1->qs + 4*iqs; - - int sumi1 = 0, sumi2 = 0; - for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { - sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; - sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; - } -#endif - const float d = (float)bq->d * __low2float(bq8_1->ds); - return d * (sumi1 + sumi2); -} - -static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( - const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { - -#if QK_K == 256 -#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics - - const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; - const uint8_t * values = (const uint8_t *)kvalues_iq4nl; - - //// iqs is 0...7 - //const int ib64 = iqs/2; - //const int il = iqs%2; - //const int32_t * q8_1 = (const int *)bq8_1[2*ib64+0].qs + 2*il; - //const int32_t * q8_2 = (const int *)bq8_1[2*ib64+1].qs + 2*il; - //const uint32_t * q4_1 = (const uint32_t *)bq4->qs + 8*ib64 + 2*il; - //const uint32_t * q4_2 = q4_1 + 4; - //const int8_t ls1 = (bq4->scales_l[ib64] & 0xf) | (((bq4->scales_h >> (4*ib64+0)) & 3) << 4); - //const int8_t ls2 = (bq4->scales_l[ib64] >> 4) | (((bq4->scales_h >> (4*ib64+2)) & 3) << 4); - //const float d1 = (float)bq4->d * (ls1 - 32) * __low2float(bq8_1[2*ib64+0].ds); - //const float d2 = (float)bq4->d * (ls2 - 32) * __low2float(bq8_1[2*ib64+1].ds); - //int v1, v2; - //int sumi1 = 0, sumi2 = 0; - //for (int j = 0; j < 2; ++j) { - // get_int_from_table_16(q4_1[j], values, v1, v2); - // sumi1 = __dp4a(v2, q8_1[j+4], __dp4a(v1, q8_1[j+0], sumi1)); - // get_int_from_table_16(q4_2[j], values, v1, v2); - // sumi2 = __dp4a(v2, q8_2[j+4], __dp4a(v1, q8_2[j+0], sumi2)); - //} - //return d1 * sumi1 + d2 * sumi2; - - // iqs is 0...7 - const int ib32 = iqs; - const int32_t * q8 = (const int *)bq8_1[ib32].qs; - const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; - const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); - int v1, v2; - int sumi1 = 0, sumi2 = 0; - for (int j = 0; j < 4; ++j) { - get_int_from_table_16(q4[j], values, v1, v2); - sumi1 = __dp4a(v1, q8[j+0], sumi1); - sumi2 = __dp4a(v2, q8[j+4], sumi2); - } - return d * (sumi1 + sumi2); - - //// iqs is 0...15 - //const int ib32 = iqs/2; - //const int il = iqs%2; - //const int32_t * q8 = (const int *)bq8_1[ib32].qs + 2*il; - //const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32 + 2*il; - //const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); - //const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); - //int v1, v2; - //int sumi1 = 0, sumi2 = 0; - //for (int j = 0; j < 2; ++j) { - // get_int_from_table_16(q4[j], values, v1, v2); - // sumi1 = __dp4a(v1, q8[j+0], sumi1); - // sumi2 = __dp4a(v2, q8[j+4], sumi2); - //} - //return d * (sumi1 + sumi2); -#else - assert(false); - return 0.f; -#endif -#else - return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); -#endif -} - -template -static __device__ __forceinline__ void mul_mat_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - const int blocks_per_warp = WARP_SIZE / qi; - - const int & ncols_dst = ncols_y; - - const int row_dst_0 = blockIdx.x*mmq_y; - const int & row_x_0 = row_dst_0; - - const int col_dst_0 = blockIdx.y*mmq_x; - const int & col_y_0 = col_dst_0; - - int * tile_x_ql = nullptr; - half2 * tile_x_dm = nullptr; - int * tile_x_qh = nullptr; - int * tile_x_sc = nullptr; - - allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); - - __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; - __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; - - float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; - - for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - - load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); - -#pragma unroll - for (int ir = 0; ir < qr; ++ir) { - const int kqs = ir*WARP_SIZE + threadIdx.x; - const int kbxd = kqs / QI8_1; - -#pragma unroll - for (int i = 0; i < mmq_x; i += nwarps) { - const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses - - const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; - - const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; - tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); - } - -#pragma unroll - for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { - const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; - const int kby = threadIdx.x % (WARP_SIZE/QI8_1); - const int col_y_eff = min(col_y_0 + ids, ncols_y-1); - - // if the sum is not needed it's faster to transform the scale to f32 ahead of time - const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; - half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; - if (need_sum) { - *dsi_dst = *dsi_src; - } else { - float * dfi_dst = (float *) dsi_dst; - *dfi_dst = __low2float(*dsi_src); - } - } - - __syncthreads(); - -// #pragma unroll // unrolling this loop causes too much register pressure - for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - sum[i/WARP_SIZE][j/nwarps] += vec_dot( - tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, - threadIdx.x + i, threadIdx.y + j, k); - } - } - } - - __syncthreads(); - } - } - -#pragma unroll - for (int j = 0; j < mmq_x; j += nwarps) { - const int col_dst = col_dst_0 + j + threadIdx.y; - - if (col_dst >= ncols_dst) { - return; - } - -#pragma unroll - for (int i = 0; i < mmq_y; i += WARP_SIZE) { - const int row_dst = row_dst_0 + threadIdx.x + i; - - if (row_dst >= nrows_dst) { - continue; - } - - dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; - } - } -} - -#define MMQ_X_Q4_0_RDNA2 64 -#define MMQ_Y_Q4_0_RDNA2 128 -#define NWARPS_Q4_0_RDNA2 8 -#define MMQ_X_Q4_0_RDNA1 64 -#define MMQ_Y_Q4_0_RDNA1 64 -#define NWARPS_Q4_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_0_AMPERE 4 -#define MMQ_Y_Q4_0_AMPERE 32 -#define NWARPS_Q4_0_AMPERE 4 -#else -#define MMQ_X_Q4_0_AMPERE 64 -#define MMQ_Y_Q4_0_AMPERE 128 -#define NWARPS_Q4_0_AMPERE 4 -#endif -#define MMQ_X_Q4_0_PASCAL 64 -#define MMQ_Y_Q4_0_PASCAL 64 -#define NWARPS_Q4_0_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q4_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_0_RDNA2; - const int mmq_y = MMQ_Y_Q4_0_RDNA2; - const int nwarps = NWARPS_Q4_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_0_RDNA1; - const int mmq_y = MMQ_Y_Q4_0_RDNA1; - const int nwarps = NWARPS_Q4_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_0_AMPERE; - const int mmq_y = MMQ_Y_Q4_0_AMPERE; - const int nwarps = NWARPS_Q4_0_AMPERE; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_0_PASCAL; - const int mmq_y = MMQ_Y_Q4_0_PASCAL; - const int nwarps = NWARPS_Q4_0_PASCAL; - - mul_mat_q, - load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q4_1_RDNA2 64 -#define MMQ_Y_Q4_1_RDNA2 128 -#define NWARPS_Q4_1_RDNA2 8 -#define MMQ_X_Q4_1_RDNA1 64 -#define MMQ_Y_Q4_1_RDNA1 64 -#define NWARPS_Q4_1_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_1_AMPERE 4 -#define MMQ_Y_Q4_1_AMPERE 32 -#define NWARPS_Q4_1_AMPERE 4 -#else -#define MMQ_X_Q4_1_AMPERE 64 -#define MMQ_Y_Q4_1_AMPERE 128 -#define NWARPS_Q4_1_AMPERE 4 -#endif -#define MMQ_X_Q4_1_PASCAL 64 -#define MMQ_Y_Q4_1_PASCAL 64 -#define NWARPS_Q4_1_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q4_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_1_RDNA2; - const int mmq_y = MMQ_Y_Q4_1_RDNA2; - const int nwarps = NWARPS_Q4_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_1_RDNA1; - const int mmq_y = MMQ_Y_Q4_1_RDNA1; - const int nwarps = NWARPS_Q4_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_1_AMPERE; - const int mmq_y = MMQ_Y_Q4_1_AMPERE; - const int nwarps = NWARPS_Q4_1_AMPERE; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_1_PASCAL; - const int mmq_y = MMQ_Y_Q4_1_PASCAL; - const int nwarps = NWARPS_Q4_1_PASCAL; - - mul_mat_q, - load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q5_0_RDNA2 64 -#define MMQ_Y_Q5_0_RDNA2 128 -#define NWARPS_Q5_0_RDNA2 8 -#define MMQ_X_Q5_0_RDNA1 64 -#define MMQ_Y_Q5_0_RDNA1 64 -#define NWARPS_Q5_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_0_AMPERE 4 -#define MMQ_Y_Q5_0_AMPERE 32 -#define NWARPS_Q5_0_AMPERE 4 -#else -#define MMQ_X_Q5_0_AMPERE 128 -#define MMQ_Y_Q5_0_AMPERE 64 -#define NWARPS_Q5_0_AMPERE 4 -#endif -#define MMQ_X_Q5_0_PASCAL 64 -#define MMQ_Y_Q5_0_PASCAL 64 -#define NWARPS_Q5_0_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q5_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_0_RDNA2; - const int mmq_y = MMQ_Y_Q5_0_RDNA2; - const int nwarps = NWARPS_Q5_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_0_RDNA1; - const int mmq_y = MMQ_Y_Q5_0_RDNA1; - const int nwarps = NWARPS_Q5_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_0_AMPERE; - const int mmq_y = MMQ_Y_Q5_0_AMPERE; - const int nwarps = NWARPS_Q5_0_AMPERE; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_0_PASCAL; - const int mmq_y = MMQ_Y_Q5_0_PASCAL; - const int nwarps = NWARPS_Q5_0_PASCAL; - - mul_mat_q, - load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q5_1_RDNA2 64 -#define MMQ_Y_Q5_1_RDNA2 128 -#define NWARPS_Q5_1_RDNA2 8 -#define MMQ_X_Q5_1_RDNA1 64 -#define MMQ_Y_Q5_1_RDNA1 64 -#define NWARPS_Q5_1_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_1_AMPERE 4 -#define MMQ_Y_Q5_1_AMPERE 32 -#define NWARPS_Q5_1_AMPERE 4 -#else -#define MMQ_X_Q5_1_AMPERE 128 -#define MMQ_Y_Q5_1_AMPERE 64 -#define NWARPS_Q5_1_AMPERE 4 -#endif -#define MMQ_X_Q5_1_PASCAL 64 -#define MMQ_Y_Q5_1_PASCAL 64 -#define NWARPS_Q5_1_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q5_1( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_1_RDNA2; - const int mmq_y = MMQ_Y_Q5_1_RDNA2; - const int nwarps = NWARPS_Q5_1_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_1_RDNA1; - const int mmq_y = MMQ_Y_Q5_1_RDNA1; - const int nwarps = NWARPS_Q5_1_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_1_AMPERE; - const int mmq_y = MMQ_Y_Q5_1_AMPERE; - const int nwarps = NWARPS_Q5_1_AMPERE; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_1_PASCAL; - const int mmq_y = MMQ_Y_Q5_1_PASCAL; - const int nwarps = NWARPS_Q5_1_PASCAL; - - mul_mat_q, - load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q8_0_RDNA2 64 -#define MMQ_Y_Q8_0_RDNA2 128 -#define NWARPS_Q8_0_RDNA2 8 -#define MMQ_X_Q8_0_RDNA1 64 -#define MMQ_Y_Q8_0_RDNA1 64 -#define NWARPS_Q8_0_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q8_0_AMPERE 4 -#define MMQ_Y_Q8_0_AMPERE 32 -#define NWARPS_Q8_0_AMPERE 4 -#else -#define MMQ_X_Q8_0_AMPERE 128 -#define MMQ_Y_Q8_0_AMPERE 64 -#define NWARPS_Q8_0_AMPERE 4 -#endif -#define MMQ_X_Q8_0_PASCAL 64 -#define MMQ_Y_Q8_0_PASCAL 64 -#define NWARPS_Q8_0_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - mul_mat_q8_0( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q8_0_RDNA2; - const int mmq_y = MMQ_Y_Q8_0_RDNA2; - const int nwarps = NWARPS_Q8_0_RDNA2; -#else - const int mmq_x = MMQ_X_Q8_0_RDNA1; - const int mmq_y = MMQ_Y_Q8_0_RDNA1; - const int nwarps = NWARPS_Q8_0_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q8_0_AMPERE; - const int mmq_y = MMQ_Y_Q8_0_AMPERE; - const int nwarps = NWARPS_Q8_0_AMPERE; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q8_0_PASCAL; - const int mmq_y = MMQ_Y_Q8_0_PASCAL; - const int nwarps = NWARPS_Q8_0_PASCAL; - - mul_mat_q, - load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q2_K_RDNA2 64 -#define MMQ_Y_Q2_K_RDNA2 128 -#define NWARPS_Q2_K_RDNA2 8 -#define MMQ_X_Q2_K_RDNA1 128 -#define MMQ_Y_Q2_K_RDNA1 32 -#define NWARPS_Q2_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q2_K_AMPERE 4 -#define MMQ_Y_Q2_K_AMPERE 32 -#define NWARPS_Q2_K_AMPERE 4 -#else -#define MMQ_X_Q2_K_AMPERE 64 -#define MMQ_Y_Q2_K_AMPERE 128 -#define NWARPS_Q2_K_AMPERE 4 -#endif -#define MMQ_X_Q2_K_PASCAL 64 -#define MMQ_Y_Q2_K_PASCAL 64 -#define NWARPS_Q2_K_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q2_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q2_K_RDNA2; - const int mmq_y = MMQ_Y_Q2_K_RDNA2; - const int nwarps = NWARPS_Q2_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q2_K_RDNA1; - const int mmq_y = MMQ_Y_Q2_K_RDNA1; - const int nwarps = NWARPS_Q2_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q2_K_AMPERE; - const int mmq_y = MMQ_Y_Q2_K_AMPERE; - const int nwarps = NWARPS_Q2_K_AMPERE; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q2_K_PASCAL; - const int mmq_y = MMQ_Y_Q2_K_PASCAL; - const int nwarps = NWARPS_Q2_K_PASCAL; - - mul_mat_q, - load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q3_K_RDNA2 128 -#define MMQ_Y_Q3_K_RDNA2 64 -#define NWARPS_Q3_K_RDNA2 8 -#define MMQ_X_Q3_K_RDNA1 32 -#define MMQ_Y_Q3_K_RDNA1 128 -#define NWARPS_Q3_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q3_K_AMPERE 4 -#define MMQ_Y_Q3_K_AMPERE 32 -#define NWARPS_Q3_K_AMPERE 4 -#else -#define MMQ_X_Q3_K_AMPERE 128 -#define MMQ_Y_Q3_K_AMPERE 128 -#define NWARPS_Q3_K_AMPERE 4 -#endif -#define MMQ_X_Q3_K_PASCAL 64 -#define MMQ_Y_Q3_K_PASCAL 64 -#define NWARPS_Q3_K_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q3_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q3_K_RDNA2; - const int mmq_y = MMQ_Y_Q3_K_RDNA2; - const int nwarps = NWARPS_Q3_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q3_K_RDNA1; - const int mmq_y = MMQ_Y_Q3_K_RDNA1; - const int nwarps = NWARPS_Q3_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q3_K_AMPERE; - const int mmq_y = MMQ_Y_Q3_K_AMPERE; - const int nwarps = NWARPS_Q3_K_AMPERE; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q3_K_PASCAL; - const int mmq_y = MMQ_Y_Q3_K_PASCAL; - const int nwarps = NWARPS_Q3_K_PASCAL; - - mul_mat_q, - load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q4_K_RDNA2 64 -#define MMQ_Y_Q4_K_RDNA2 128 -#define NWARPS_Q4_K_RDNA2 8 -#define MMQ_X_Q4_K_RDNA1 32 -#define MMQ_Y_Q4_K_RDNA1 64 -#define NWARPS_Q4_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q4_K_AMPERE 4 -#define MMQ_Y_Q4_K_AMPERE 32 -#define NWARPS_Q4_K_AMPERE 4 -#else -#define MMQ_X_Q4_K_AMPERE 64 -#define MMQ_Y_Q4_K_AMPERE 128 -#define NWARPS_Q4_K_AMPERE 4 -#endif -#define MMQ_X_Q4_K_PASCAL 64 -#define MMQ_Y_Q4_K_PASCAL 64 -#define NWARPS_Q4_K_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q4_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q4_K_RDNA2; - const int mmq_y = MMQ_Y_Q4_K_RDNA2; - const int nwarps = NWARPS_Q4_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q4_K_RDNA1; - const int mmq_y = MMQ_Y_Q4_K_RDNA1; - const int nwarps = NWARPS_Q4_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q4_K_AMPERE; - const int mmq_y = MMQ_Y_Q4_K_AMPERE; - const int nwarps = NWARPS_Q4_K_AMPERE; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q4_K_PASCAL; - const int mmq_y = MMQ_Y_Q4_K_PASCAL; - const int nwarps = NWARPS_Q4_K_PASCAL; - - mul_mat_q, - load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q5_K_RDNA2 64 -#define MMQ_Y_Q5_K_RDNA2 128 -#define NWARPS_Q5_K_RDNA2 8 -#define MMQ_X_Q5_K_RDNA1 32 -#define MMQ_Y_Q5_K_RDNA1 64 -#define NWARPS_Q5_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q5_K_AMPERE 4 -#define MMQ_Y_Q5_K_AMPERE 32 -#define NWARPS_Q5_K_AMPERE 4 -#else -#define MMQ_X_Q5_K_AMPERE 64 -#define MMQ_Y_Q5_K_AMPERE 128 -#define NWARPS_Q5_K_AMPERE 4 -#endif -#define MMQ_X_Q5_K_PASCAL 64 -#define MMQ_Y_Q5_K_PASCAL 64 -#define NWARPS_Q5_K_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -mul_mat_q5_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q5_K_RDNA2; - const int mmq_y = MMQ_Y_Q5_K_RDNA2; - const int nwarps = NWARPS_Q5_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q5_K_RDNA1; - const int mmq_y = MMQ_Y_Q5_K_RDNA1; - const int nwarps = NWARPS_Q5_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q5_K_AMPERE; - const int mmq_y = MMQ_Y_Q5_K_AMPERE; - const int nwarps = NWARPS_Q5_K_AMPERE; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q5_K_PASCAL; - const int mmq_y = MMQ_Y_Q5_K_PASCAL; - const int nwarps = NWARPS_Q5_K_PASCAL; - - mul_mat_q, - load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -#define MMQ_X_Q6_K_RDNA2 64 -#define MMQ_Y_Q6_K_RDNA2 128 -#define NWARPS_Q6_K_RDNA2 8 -#define MMQ_X_Q6_K_RDNA1 32 -#define MMQ_Y_Q6_K_RDNA1 64 -#define NWARPS_Q6_K_RDNA1 8 -#if defined(CUDA_USE_TENSOR_CORES) -#define MMQ_X_Q6_K_AMPERE 4 -#define MMQ_Y_Q6_K_AMPERE 32 -#define NWARPS_Q6_K_AMPERE 4 -#else -#define MMQ_X_Q6_K_AMPERE 64 -#define MMQ_Y_Q6_K_AMPERE 64 -#define NWARPS_Q6_K_AMPERE 4 -#endif -#define MMQ_X_Q6_K_PASCAL 64 -#define MMQ_Y_Q6_K_PASCAL 64 -#define NWARPS_Q6_K_PASCAL 8 - -template static __global__ void -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) -#endif // defined(RDNA3) || defined(RDNA2) -#elif __CUDA_ARCH__ < CC_VOLTA - __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) -#endif // __CUDA_ARCH__ < CC_VOLTA - mul_mat_q6_K( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -#if defined(RDNA3) || defined(RDNA2) - const int mmq_x = MMQ_X_Q6_K_RDNA2; - const int mmq_y = MMQ_Y_Q6_K_RDNA2; - const int nwarps = NWARPS_Q6_K_RDNA2; -#else - const int mmq_x = MMQ_X_Q6_K_RDNA1; - const int mmq_y = MMQ_Y_Q6_K_RDNA1; - const int nwarps = NWARPS_Q6_K_RDNA1; -#endif // defined(RDNA3) || defined(RDNA2) - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= CC_VOLTA - const int mmq_x = MMQ_X_Q6_K_AMPERE; - const int mmq_y = MMQ_Y_Q6_K_AMPERE; - const int nwarps = NWARPS_Q6_K_AMPERE; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - -#elif __CUDA_ARCH__ >= MIN_CC_DP4A - const int mmq_x = MMQ_X_Q6_K_PASCAL; - const int mmq_y = MMQ_Y_Q6_K_PASCAL; - const int nwarps = NWARPS_Q6_K_PASCAL; - - mul_mat_q, - load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); -#else - GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); - NO_DEVICE_CODE; -#endif // __CUDA_ARCH__ >= CC_VOLTA -} - -template -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -// tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) -static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) - constexpr int nwarps = 1; - constexpr int rows_per_cuda_block = 1; -#else - constexpr int nwarps = ncols_y <= 4 ? 4 : 2; - constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) - - const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; - const int row0 = rows_per_cuda_block*blockIdx.x; - const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; - constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; - -// partial sum for each thread - float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; - - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; - - for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { - const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx - - // x block quant index when casting the quants to int - const int kqs = vdr * (tid % (qi/vdr)); - -#pragma unroll - for (int j = 0; j < ncols_y; ++j) { -#pragma unroll - for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda( - &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs); - } - } - } - - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; - if (threadIdx.y > 0) { -#pragma unroll - for (int j = 0; j < ncols_y; ++j) { -#pragma unroll - for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; - } - } - } - __syncthreads(); - if (threadIdx.y > 0) { - return; - } - - // sum up partial sums and write back result -#pragma unroll - for (int j = 0; j < ncols_y; ++j) { -#pragma unroll - for (int i = 0; i < rows_per_cuda_block; ++i) { -#pragma unroll - for (int l = 0; l < nwarps-1; ++l) { - tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; - } - tmp[j][i] = warp_reduce_sum(tmp[j][i]); - } - - if (threadIdx.x < rows_per_cuda_block) { - dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; - } - } -} - -template -static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { - // qk = quantized weights per x block - // qr = number of quantized weights per data value in x block - const int row = blockIdx.x*blockDim.y + threadIdx.y; - - if (row >= nrows) { - return; - } - - const int tid = threadIdx.x; - - const int iter_stride = 2*GGML_CUDA_DMMV_X; - const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter - const int y_offset = qr == 1 ? 1 : qk/2; - -// partial sum for each thread -#ifdef GGML_CUDA_F16 - half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics -#else - float tmp = 0.0f; -#endif // GGML_CUDA_F16 - - for (int i = 0; i < ncols; i += iter_stride) { - const int col = i + vals_per_iter*tid; - const int ib = (row*ncols + col)/qk; // x block index - const int iqs = (col%qk)/qr; // x quant index - const int iybs = col - col%qk; // y block start index - -// processing >2 values per i iter is faster for fast GPUs -#pragma unroll - for (int j = 0; j < vals_per_iter; j += 2) { - // process 2 vals per j iter - - // dequantize - // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val - dfloat2 v; - dequantize_kernel(vx, ib, iqs + j/qr, v); - - // matrix multiplication - // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 -#ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); -#else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; -#endif // GGML_CUDA_F16 - } - } - - // sum up partial sums and write back result - tmp = warp_reduce_sum(tmp); - - if (tid == 0) { -#ifdef GGML_CUDA_F16 - dst[row] = tmp.x + tmp.y; -#else - dst[row] = tmp; -#endif // GGML_CUDA_F16 - } -} - static __global__ void mul_mat_p021_f16_f32( const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { @@ -6614,1916 +1151,6 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous } } -static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - half * dsti = (half *) cdsti; - - *dsti = __float2half(*xi); -} - -static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { - const half * xi = (const half *) cxi; - half * dsti = (half *) cdsti; - - *dsti = *xi; -} - -static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { - const half * xi = (const half *) cxi; - float * dsti = (float *) cdsti; - - *dsti = *xi; -} - -template -static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13) { - const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= ne) { - return; - } - - // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor - // then combine those indices with the corresponding byte offsets to get the total offsets - const int64_t i03 = i/(ne00 * ne01 * ne02); - const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int64_t i13 = i/(ne10 * ne11 * ne12); - const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; - - cpy_1(cx + x_offset, cdst + dst_offset); -} - -static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q8_0 * dsti = (block_q8_0 *) cdsti; - - float amax = 0.0f; // absolute max - - for (int j = 0; j < QK8_0; j++) { - const float v = xi[j]; - amax = fmaxf(amax, fabsf(v)); - } - - const float d = amax / ((1 << 7) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK8_0; ++j) { - const float x0 = xi[j]*id; - - dsti->qs[j] = roundf(x0); - } -} - -static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_0 * dsti = (block_q4_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_0; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - const float d = vmax / -8; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - for (int j = 0; j < QK4_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_0/2 + j]*id; - - const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); - const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q4_1 * dsti = (block_q4_1 *) cdsti; - - float vmin = FLT_MAX; - float vmax = -FLT_MAX; - - for (int j = 0; j < QK4_1; ++j) { - const float v = xi[j]; - - if (v < vmin) vmin = v; - if (v > vmax) vmax = v; - } - - const float d = (vmax - vmin) / ((1 << 4) - 1); - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x = d; - dsti->dm.y = vmin; - - for (int j = 0; j < QK4_1/2; ++j) { - const float x0 = (xi[0 + j] - vmin)*id; - const float x1 = (xi[QK4_1/2 + j] - vmin)*id; - - const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); - const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f)); - - dsti->qs[j] = xi0; - dsti->qs[j] |= xi1 << 4; - } -} - -static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_0 * dsti = (block_q5_0 *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK5_0; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - const float d = vmax / -16; - const float id = d ? 1.0f/d : 0.0f; - - dsti->d = d; - - uint32_t qh = 0; - for (int j = 0; j < QK5_0/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK5_0/2 + j]*id; - - const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f)); - const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f)); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_q5_1 * dsti = (block_q5_1 *) cdsti; - - float min = xi[0]; - float max = xi[0]; - - for (int j = 1; j < QK5_1; ++j) { - const float v = xi[j]; - min = v < min ? v : min; - max = v > max ? v : max; - } - - const float d = (max - min) / 31; - const float id = d ? 1.0f/d : 0.0f; - - dsti->dm.x = d; - dsti->dm.y = min; - - uint32_t qh = 0; - for (int j = 0; j < QK5_1/2; ++j) { - const float x0 = (xi[0 + j] - min)*id; - const float x1 = (xi[QK5_1/2 + j] - min)*id; - - const uint8_t xi0 = (uint8_t)(x0 + 0.5f); - const uint8_t xi1 = (uint8_t)(x1 + 0.5f); - - dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); - qh |= ((xi0 & 0x10u) >> 4) << (j + 0); - qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); - } - memcpy(dsti->qh, &qh, sizeof(qh)); -} - -static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { - if (x <= val[0]) return 0; - if (x >= val[n-1]) return n-1; - int ml = 0, mu = n-1; - while (mu-ml > 1) { - int mav = (ml+mu)/2; - if (x < val[mav]) mu = mav; else ml = mav; - } - return x - val[mu-1] < val[mu] - x ? mu-1 : mu; -} - -static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { - const float * xi = (const float *) cxi; - block_iq4_nl * dsti = (block_iq4_nl *) cdsti; - - float amax = 0.0f; - float vmax = 0.0f; - - for (int j = 0; j < QK4_NL; ++j) { - const float v = xi[j]; - if (amax < fabsf(v)) { - amax = fabsf(v); - vmax = v; - } - } - - float d = vmax / kvalues_iq4nl[0]; - const float id = d ? 1.0f/d : 0.0f; - - float sumqx = 0, sumq2 = 0; - for (int j = 0; j < QK4_NL/2; ++j) { - const float x0 = xi[0 + j]*id; - const float x1 = xi[QK4_NL/2 + j]*id; - const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); - const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); - dsti->qs[j] = xi0 | (xi1 << 4); - const float v0 = kvalues_iq4nl[xi0]; - const float v1 = kvalues_iq4nl[xi1]; - const float w0 = xi[0 + j]*xi[0 + j]; - const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j]; - sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j]; - sumq2 += w0*v0*v0 + w1*v1*v1; - } - - dsti->d = sumq2 > 0 ? sumqx/sumq2 : d; -} - - -template -static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, - const int nb12, const int nb13) { - const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; - - if (i >= ne) { - return; - } - - const int i03 = i/(ne00 * ne01 * ne02); - const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); - const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; - const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; - const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; - - const int i13 = i/(ne10 * ne11 * ne12); - const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); - const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; - const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; - const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; - - cpy_blck(cx + x_offset, cdst + dst_offset); -} - -static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { - const float y = (i0 / 2 - low) / max(0.001f, high - low); - return 1.0f - min(1.0f, max(0.0f, y)); -} - -struct rope_corr_dims { - float v[4]; -}; - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -static __device__ void rope_yarn( - float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, - float * cos_theta, float * sin_theta -) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); - } - *cos_theta = cosf(theta) * mscale; - *sin_theta = sinf(theta) * mscale; -} - -// rope == RoPE == rotary positional embedding -template -static __global__ void rope( - const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - float ext_factor, float attn_factor, rope_corr_dims corr_dims -) { - const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); - - if (col >= ncols) { - return; - } - - const int row = blockDim.x*blockIdx.x + threadIdx.x; - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*powf(freq_base, -float(col)/ncols); - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + 1]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + 1] = x0*sin_theta + x1*cos_theta; -} - -template -static __global__ void rope_neox( - const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims -) { - const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); - - if (col >= ncols) { - return; - } - - const int row = blockDim.x*blockIdx.x + threadIdx.x; - const int ib = col / n_dims; - const int ic = col % n_dims; - - if (ib > 0) { - const int i = row*ncols + ib*n_dims + ic; - - dst[i + 0] = x[i + 0]; - dst[i + 1] = x[i + 1]; - - return; - } - - const int i = row*ncols + ib*n_dims + ic/2; - const int i2 = row/p_delta_rows; - - float cur_rot = inv_ndims * ic - ib; - - const int p = has_pos ? pos[i2] : 0; - const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f); - - float cos_theta, sin_theta; - rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; -} - -static __global__ void rope_glm_f32( - const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, - int n_ctx -) { - const int col = blockDim.x*blockIdx.x + threadIdx.x; - const int half_n_dims = ncols/4; - - if (col >= half_n_dims) { - return; - } - - const int row = blockDim.y*blockIdx.y + threadIdx.y; - const int i = row*ncols + col; - const int i2 = row/p_delta_rows; - - const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); - // FIXME: this is likely wrong - const int p = pos != nullptr ? pos[i2] : 0; - - const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; - const float sin_theta = sinf(theta); - const float cos_theta = cosf(theta); - - const float x0 = x[i + 0]; - const float x1 = x[i + half_n_dims]; - - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; - - const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; - const float sin_block_theta = sinf(block_theta); - const float cos_block_theta = cosf(block_theta); - - const float x2 = x[i + half_n_dims * 2]; - const float x3 = x[i + half_n_dims * 3]; - - dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; - dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; -} - -static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, - const int n_heads_log2_floor, const float m0, const float m1) { - const int col = blockDim.x*blockIdx.x + threadIdx.x; - - if (col >= ncols) { - return; - } - - const int row = blockDim.y*blockIdx.y + threadIdx.y; - const int i = row*ncols + col; - - const int k = row/k_rows; - - float m_k; - if (k < n_heads_log2_floor) { - m_k = powf(m0, k + 1); - } else { - m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); - } - - dst[i] = col * m_k + x[i]; -} - -static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { - const int row = blockIdx.x; - const int col = threadIdx.x; - - float sum = 0.0f; - for (int i = col; i < ncols; i += blockDim.x) { - sum += x[row * ncols + i]; - } - - sum = warp_reduce_sum(sum); - - if (col == 0) { - dst[row] = sum; - } -} - -template -static inline __device__ void ggml_cuda_swap(T & a, T & b) { - T tmp = a; - a = b; - b = tmp; -} - -template -static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols) { - // bitonic sort - int col = threadIdx.x; - int row = blockIdx.y; - - if (col >= ncols) return; - - const float * x_row = x + row * ncols; - int * dst_row = dst + row * ncols; - - // initialize indices - if (col < ncols) { - dst_row[col] = col; - } - __syncthreads(); - - for (int k = 2; k <= ncols; k *= 2) { - for (int j = k / 2; j > 0; j /= 2) { - int ixj = col ^ j; - if (ixj > col) { - if ((col & k) == 0) { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { - ggml_cuda_swap(dst_row[col], dst_row[ixj]); - } - } else { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { - ggml_cuda_swap(dst_row[col], dst_row[ixj]); - } - } - } - __syncthreads(); - } - } -} - -static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { - const int col = blockDim.y*blockIdx.y + threadIdx.y; - const int row = blockDim.x*blockIdx.x + threadIdx.x; - - if (col >= ncols) { - return; - } - - const int i = row*ncols + col; - //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; - //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU - dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; -} - -template -static __global__ void soft_max_f32(const float * x, const float * mask, const float * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { - const int ncols = ncols_template == 0 ? ncols_par : ncols_template; - - const int tid = threadIdx.x; - const int rowx = blockIdx.x; - const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension - - const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; - - const int warp_id = threadIdx.x / WARP_SIZE; - const int lane_id = threadIdx.x % WARP_SIZE; - - float slope = 0.0f; - - // ALiBi - if (max_bias > 0.0f) { - const int h = rowx/nrows_y; // head index - - const float base = h < n_head_log2 ? m0 : m1; - const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; - - slope = powf(base, exp); - } - - extern __shared__ float data_soft_max_f32[]; - float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication - // shared memory buffer to cache values between iterations: - float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + rowx*ncols; - - float max_val = -INFINITY; - -#pragma unroll - for (int col0 = 0; col0 < ncols; col0 += block_size) { - const int col = col0 + tid; - - if (ncols_template == 0 && col >= ncols) { - break; - } - - const int ix = rowx*ncols + col; - const int iy = rowy*ncols + col; - - const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f); - - vals[col] = val; - max_val = max(max_val, val); - } - - // find the max value in the block - max_val = warp_reduce_max(max_val); - if (block_size > WARP_SIZE) { - if (warp_id == 0) { - buf_iw[lane_id] = -INFINITY; - } - __syncthreads(); - - if (lane_id == 0) { - buf_iw[warp_id] = max_val; - } - __syncthreads(); - - max_val = buf_iw[lane_id]; - max_val = warp_reduce_max(max_val); - } - - float tmp = 0.0f; // partial sum - -#pragma unroll - for (int col0 = 0; col0 < ncols; col0 += block_size) { - const int col = col0 + tid; - - if (ncols_template == 0 && col >= ncols) { - break; - } - - const float val = expf(vals[col] - max_val); - tmp += val; - vals[col] = val; - } - - // find the sum of exps in the block - tmp = warp_reduce_sum(tmp); - if (block_size > WARP_SIZE) { - __syncthreads(); - if (warp_id == 0) { - buf_iw[lane_id] = 0.0f; - } - __syncthreads(); - - if (lane_id == 0) { - buf_iw[warp_id] = tmp; - } - __syncthreads(); - - tmp = buf_iw[lane_id]; - tmp = warp_reduce_sum(tmp); - } - - const float inv_sum = 1.0f / tmp; - -#pragma unroll - for (int col0 = 0; col0 < ncols; col0 += block_size) { - const int col = col0 + tid; - - if (ncols_template == 0 && col >= ncols) { - return; - } - - const int idst = rowx*ncols + col; - dst[idst] = vals[col] * inv_sum; - } -} - -static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - - dst[i] = scale * x[i]; -} - -static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { - const int i = blockDim.x*blockIdx.x + threadIdx.x; - - if (i >= k) { - return; - } - - dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); -} - -template -static __global__ void im2col_kernel( - const float * x, T * dst, int64_t batch_offset, - int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, - int s0, int s1, int p0, int p1, int d0, int d1) { - const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; - if (i >= pelements) { - return; - } - - const int64_t ksize = OW * (KH > 1 ? KW : 1); - const int64_t kx = i / ksize; - const int64_t kd = kx * ksize; - const int64_t ky = (i - kd) / OW; - const int64_t ix = i % OW; - - const int64_t oh = blockIdx.y; - const int64_t batch = blockIdx.z / IC; - const int64_t ic = blockIdx.z % IC; - - const int64_t iiw = ix * s0 + kx * d0 - p0; - const int64_t iih = oh * s1 + ky * d1 - p1; - - const int64_t offset_dst = - ((batch * OH + oh) * OW + ix) * CHW + - (ic * (KW * KH) + ky * KW + kx); - - if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { - dst[offset_dst] = 0.0f; - } else { - const int64_t offset_src = ic * offset_delta + batch * batch_offset; - dst[offset_dst] = x[offset_src + iih * IW + iiw]; - } -} - -template -static __global__ void pool2d_nchw_kernel( - const int ih, const int iw, const int oh, const int ow, - const int kh, const int kw, const int sh, const int sw, - const int ph, const int pw, const int parallel_elements, - const Ti* src, To* dst, const enum ggml_op_pool op) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= parallel_elements) { - return; - } - - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = max(0, start_h); - const int eh = min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = max(0, start_w); - const int ew = min(iw, start_w + kw); - const To scale = 1. / (kh * kw); - To res = 0; - - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - default: assert(false); - } - - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { -#if __CUDA_ARCH__ >= 350 - Ti cur = __ldg(i_ptr + i * iw + j); -#else - Ti cur = i_ptr[i * iw + j]; -#endif - switch (op) { - case GGML_OP_POOL_AVG: res += cur * scale; break; - case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; - default: assert(false); - } - } - } - o_ptr[cur_oh * ow + cur_ow] = res; -} - -template -static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); - const dim3 block_nums(block_num_x, ne10, ne11*ne12); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - GGML_ASSERT(ne00 % 2 == 0); - - k_get_rows<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); -} - -template -static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); - const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; - const dim3 block_nums(block_num_x, ne10, ne11*ne12); - - // strides in elements - //const size_t s0 = nb0 / ggml_element_size(dst); - const size_t s1 = nb1 / ggml_element_size(dst); - const size_t s2 = nb2 / ggml_element_size(dst); - const size_t s3 = nb3 / ggml_element_size(dst); - - const size_t s10 = nb10 / ggml_element_size(src1); - const size_t s11 = nb11 / ggml_element_size(src1); - const size_t s12 = nb12 / ggml_element_size(src1); - //const size_t s13 = nb13 / ggml_element_size(src1); - - k_get_rows_float<<>>( - src0_dd, src1_dd, dst_dd, - ne00, /*ne01, ne02, ne03,*/ - /*ne10, ne11,*/ ne12, /*ne13,*/ - /* s0,*/ s1, s2, s3, - /* nb00,*/ nb01, nb02, nb03, - s10, s11, s12/*, s13*/); - - GGML_UNUSED(dst); -} - -template -struct bin_bcast_cuda { - template - void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, - const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, - cudaStream_t stream) { - - GGML_TENSOR_BINARY_OP_LOCALS - - int nr0 = ne10/ne0; - int nr1 = ne11/ne1; - int nr2 = ne12/ne2; - int nr3 = ne13/ne3; - - int nr[4] = { nr0, nr1, nr2, nr3 }; - - // collapse dimensions until first broadcast dimension - int64_t cne0[] = {ne0, ne1, ne2, ne3}; - int64_t cne1[] = {ne10, ne11, ne12, ne13}; - size_t cnb0[] = {nb0, nb1, nb2, nb3}; - size_t cnb1[] = {nb10, nb11, nb12, nb13}; - auto collapse = [](int64_t cne[]) { - cne[0] *= cne[1]; - cne[1] = cne[2]; - cne[2] = cne[3]; - cne[3] = 1; - }; - - auto collapse_nb = [](size_t cnb[], const int64_t cne[]) { - cnb[1] *= cne[1]; - cnb[2] *= cne[2]; - cnb[3] *= cne[3]; - }; - - for (int i = 0; i < 4; i++) { - if (nr[i] != 1) { - break; - } - if (i > 0) { - collapse_nb(cnb0, cne0); - collapse_nb(cnb1, cne1); - collapse(cne0); - collapse(cne1); - } - } - { - int64_t ne0 = cne0[0]; - int64_t ne1 = cne0[1]; - int64_t ne2 = cne0[2]; - int64_t ne3 = cne0[3]; - - int64_t ne10 = cne1[0]; - int64_t ne11 = cne1[1]; - int64_t ne12 = cne1[2]; - int64_t ne13 = cne1[3]; - - size_t nb0 = cnb0[0]; - size_t nb1 = cnb0[1]; - size_t nb2 = cnb0[2]; - size_t nb3 = cnb0[3]; - - size_t nb10 = cnb1[0]; - size_t nb11 = cnb1[1]; - size_t nb12 = cnb1[2]; - size_t nb13 = cnb1[3]; - - size_t s0 = nb0 / sizeof(dst_t); - size_t s1 = nb1 / sizeof(dst_t); - size_t s2 = nb2 / sizeof(dst_t); - size_t s3 = nb3 / sizeof(dst_t); - - size_t s10 = nb10 / sizeof(src1_t); - size_t s11 = nb11 / sizeof(src1_t); - size_t s12 = nb12 / sizeof(src1_t); - size_t s13 = nb13 / sizeof(src1_t); - - GGML_ASSERT(s0 == 1); - GGML_ASSERT(s10 == 1); - - const int block_size = 128; - - int64_t hne0 = std::max(ne0/2LL, 1LL); - - dim3 block_dims; - block_dims.x = std::min(hne0, block_size); - block_dims.y = std::min(ne1, block_size / block_dims.x); - block_dims.z = std::min(std::min(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U); - - dim3 block_nums( - (hne0 + block_dims.x - 1) / block_dims.x, - (ne1 + block_dims.y - 1) / block_dims.y, - (ne2*ne3 + block_dims.z - 1) / block_dims.z - ); - - if (block_nums.z > 65535) { - // this is the maximum number of blocks in z direction, fallback to 1D grid kernel - int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; - k_bin_bcast_unravel<<>>( - src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s10, */ s11, s12, s13); - } else { - k_bin_bcast<<>>( - src0_dd, src1_dd, dst_dd, - ne0, ne1, ne2, ne3, - ne10, ne11, ne12, ne13, - /* s0, */ s1, s2, s3, - /* s10, */ s11, s12, s13); - } - } - } -}; - -static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, - const int ne10, const int ne11, const int ne12, - const int nb1, const int nb2, const int offset, cudaStream_t stream) { - int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; - acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); -} - -static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; - gelu_f32<<>>(x, dst, k); -} - -static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; - silu_f32<<>>(x, dst, k); -} - -static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; - gelu_quick_f32<<>>(x, dst, k); -} - -static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; - tanh_f32<<>>(x, dst, k); -} - -static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; - relu_f32<<>>(x, dst, k); -} - -static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE; - hardsigmoid_f32<<>>(x, dst, k); -} - -static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE; - hardswish_f32<<>>(x, dst, k); -} - -static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) { - const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; - leaky_relu_f32<<>>(x, dst, k, negative_slope); -} - -static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE; - sqr_f32<<>>(x, dst, k); -} - -static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { - GGML_ASSERT(ncols % WARP_SIZE == 0); - if (ncols < 1024) { - const dim3 block_dims(WARP_SIZE, 1, 1); - norm_f32<<>>(x, dst, ncols, eps); - } else { - const dim3 block_dims(1024, 1, 1); - norm_f32<1024><<>>(x, dst, ncols, eps); - } -} - -static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) { - static const float eps = 1e-6f; - if (group_size < 1024) { - const dim3 block_dims(WARP_SIZE, 1, 1); - group_norm_f32<<>>(x, dst, group_size, ne_elements, eps); - } else { - const dim3 block_dims(1024, 1, 1); - group_norm_f32<1024><<>>(x, dst, group_size, ne_elements, eps); - } -} - -static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2); - concat_f32<<>>(x, y, dst, ne0, ne02); -} - -static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, - const int scale_factor, cudaStream_t stream) { - int ne0 = (ne00 * scale_factor); - int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; - dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03); - upscale_f32<<>>(x, dst, ne00, ne00 * ne01, scale_factor); -} - -static void pad_f32_cuda(const float * x, float * dst, - const int ne00, const int ne01, const int ne02, const int ne03, - const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne1, ne2*ne3); - pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); -} - -static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { - int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; - arange_f32<<>>(dst, ne0, start, step); -} - -static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1, - const int dim, const int max_period, cudaStream_t stream) { - int half_ceil = (dim + 1) / 2; - int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE; - dim3 gridDim(num_blocks, ne00, 1); - timestep_embedding_f32<<>>(x, dst, nb1, dim, max_period); -} - -static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { - GGML_ASSERT(ncols % WARP_SIZE == 0); - if (ncols < 1024) { - const dim3 block_dims(WARP_SIZE, 1, 1); - rms_norm_f32<<>>(x, dst, ncols, eps); - } else { - const dim3 block_dims(1024, 1, 1); - rms_norm_f32<1024><<>>(x, dst, ncols, eps); - } -} - -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { - const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, ky, 1); - const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx, kx_padded); -} - -template -static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); - dequantize_block<<>>(vx, y, k); -} - -static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN; - if (k % CUDA_Q8_0_NE_ALIGN == 0) { - const bool need_check = false; - dequantize_block_q8_0_f16<<>>(vx, y, k); - } else { - const bool need_check = true; - dequantize_block_q8_0_f16<<>>(vx, y, k); - } -} - -template -static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; -#if QK_K == 256 - dequantize_block_q2_K<<>>(vx, y); -#else - dequantize_block_q2_K<<>>(vx, y); -#endif -} - -template -static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; -#if QK_K == 256 - dequantize_block_q3_K<<>>(vx, y); -#else - dequantize_block_q3_K<<>>(vx, y); -#endif -} - -template -static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb32 = k / 32; - const int nb = (k + 255) / 256; - dequantize_block_q4_0<<>>(vx, y, nb32); -} - -template -static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb32 = k / 32; - const int nb = (k + 255) / 256; - dequantize_block_q4_1<<>>(vx, y, nb32); -} - -template -static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_q4_K<<>>(vx, y); -} - -template -static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; -#if QK_K == 256 - dequantize_block_q5_K<<>>(vx, y); -#else - dequantize_block_q5_K<<>>(vx, y); -#endif -} - -template -static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; -#if QK_K == 256 - dequantize_block_q6_K<<>>(vx, y); -#else - dequantize_block_q6_K<<>>(vx, y); -#endif -} - -template -static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_xxs<<>>(vx, y); -} - -template -static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_xs<<>>(vx, y); -} - -template -static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq2_s<<>>(vx, y); -} - -template -static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq3_xxs<<>>(vx, y); -} - -template -static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq3_s<<>>(vx, y); -} - -template -static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = k / QK_K; - dequantize_block_iq1_s<<>>(vx, y); -} - -template -static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = (k + QK_K - 1) / QK_K; - dequantize_block_iq4_nl<<>>(vx, y); -} - -template -static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { - const int nb = (k + QK_K - 1) / QK_K; -#if QK_K == 64 - dequantize_block_iq4_nl<<>>(vx, y); -#else - dequantize_block_iq4_xs<<>>(vx, y); -#endif -} - -template -static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; - convert_unary<<>>(vx, y, k); -} - -static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { - int id; - switch (type) { - case GGML_TYPE_Q4_0: - return dequantize_row_q4_0_cuda; - case GGML_TYPE_Q4_1: - return dequantize_row_q4_1_cuda; - case GGML_TYPE_Q5_0: - return dequantize_block_cuda; - case GGML_TYPE_Q5_1: - return dequantize_block_cuda; - case GGML_TYPE_Q8_0: - CUDA_CHECK(cudaGetDevice(&id)); - if (get_cuda_global_info().devices[id].cc >= CC_PASCAL) { - return dequantize_block_q8_0_f16_cuda; - } - return dequantize_block_cuda; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_cuda; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_cuda; - case GGML_TYPE_Q4_K: - return dequantize_row_q4_K_cuda; - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_cuda; - case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_cuda; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_cuda; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_cuda; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_cuda; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_cuda; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_cuda; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_cuda; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_cuda; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_cuda; - case GGML_TYPE_F32: - return convert_unary_cuda; - default: - return nullptr; - } -} - -static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { - switch (type) { - case GGML_TYPE_Q4_0: - return dequantize_row_q4_0_cuda; - case GGML_TYPE_Q4_1: - return dequantize_row_q4_1_cuda; - case GGML_TYPE_Q5_0: - return dequantize_block_cuda; - case GGML_TYPE_Q5_1: - return dequantize_block_cuda; - case GGML_TYPE_Q8_0: - return dequantize_block_cuda; - case GGML_TYPE_Q2_K: - return dequantize_row_q2_K_cuda; - case GGML_TYPE_Q3_K: - return dequantize_row_q3_K_cuda; - case GGML_TYPE_Q4_K: - return dequantize_row_q4_K_cuda; - case GGML_TYPE_Q5_K: - return dequantize_row_q5_K_cuda; - case GGML_TYPE_Q6_K: - return dequantize_row_q6_K_cuda; - case GGML_TYPE_IQ2_XXS: - return dequantize_row_iq2_xxs_cuda; - case GGML_TYPE_IQ2_XS: - return dequantize_row_iq2_xs_cuda; - case GGML_TYPE_IQ2_S: - return dequantize_row_iq2_s_cuda; - case GGML_TYPE_IQ3_XXS: - return dequantize_row_iq3_xxs_cuda; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_cuda; - case GGML_TYPE_IQ4_NL: - return dequantize_row_iq4_nl_cuda; - case GGML_TYPE_IQ4_XS: - return dequantize_row_iq4_xs_cuda; - case GGML_TYPE_IQ3_S: - return dequantize_row_iq3_s_cuda; - case GGML_TYPE_F16: - return convert_unary_cuda; - default: - return nullptr; - } -} - -static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); -} - -static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const dim3 block_dims(32, 1, 1); - dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); -} - -static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % QK_K == 0); - const int ny = 2 / K_QUANTS_PER_ITERATION; - const int block_num_y = (nrows + ny - 1) / ny; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(32, ny, 1); - dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); -} - -static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(block_num_y, 1, 1); - const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec<1, 1, convert_f16> - <<>>(vx, y, dst, ncols, nrows); -} - -template -static void mul_mat_vec_q_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - GGML_ASSERT(ncols_x % qk == 0); - GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - - int64_t nwarps = 1; - int64_t rows_per_cuda_block = 1; - - if (get_cuda_global_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 - switch(ncols_y) { - case 1: - nwarps = 4; - rows_per_cuda_block = 1; - break; - case 2: - case 3: - case 4: - nwarps = 4; - rows_per_cuda_block = 2; - break; - case 5: - case 6: - case 7: - case 8: - nwarps = 2; - rows_per_cuda_block = 2; - break; - default: - GGML_ASSERT(false); - break; - } - } - const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block; - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - switch (ncols_y) { - case 1: - mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 2: - mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 3: - mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 4: - mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 5: - mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 6: - mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 7: - mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - case 8: - mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> - <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); - break; - default: - GGML_ASSERT(false); - break; - } -} - -static void ggml_mul_mat_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_0_RDNA2; - mmq_y = MMQ_Y_Q4_0_RDNA2; - nwarps = NWARPS_Q4_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_0_RDNA1; - mmq_y = MMQ_Y_Q4_0_RDNA1; - nwarps = NWARPS_Q4_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_0_AMPERE; - mmq_y = MMQ_Y_Q4_0_AMPERE; - nwarps = NWARPS_Q4_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_0_PASCAL; - mmq_y = MMQ_Y_Q4_0_PASCAL; - nwarps = NWARPS_Q4_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_1_RDNA2; - mmq_y = MMQ_Y_Q4_1_RDNA2; - nwarps = NWARPS_Q4_1_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_1_RDNA1; - mmq_y = MMQ_Y_Q4_1_RDNA1; - nwarps = NWARPS_Q4_1_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_1_AMPERE; - mmq_y = MMQ_Y_Q4_1_AMPERE; - nwarps = NWARPS_Q4_1_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_1_PASCAL; - mmq_y = MMQ_Y_Q4_1_PASCAL; - nwarps = NWARPS_Q4_1_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_0_RDNA2; - mmq_y = MMQ_Y_Q5_0_RDNA2; - nwarps = NWARPS_Q5_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_0_RDNA1; - mmq_y = MMQ_Y_Q5_0_RDNA1; - nwarps = NWARPS_Q5_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_0_AMPERE; - mmq_y = MMQ_Y_Q5_0_AMPERE; - nwarps = NWARPS_Q5_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_0_PASCAL; - mmq_y = MMQ_Y_Q5_0_PASCAL; - nwarps = NWARPS_Q5_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_1_RDNA2; - mmq_y = MMQ_Y_Q5_1_RDNA2; - nwarps = NWARPS_Q5_1_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_1_RDNA1; - mmq_y = MMQ_Y_Q5_1_RDNA1; - nwarps = NWARPS_Q5_1_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_1_AMPERE; - mmq_y = MMQ_Y_Q5_1_AMPERE; - nwarps = NWARPS_Q5_1_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_1_PASCAL; - mmq_y = MMQ_Y_Q5_1_PASCAL; - nwarps = NWARPS_Q5_1_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q8_0_RDNA2; - mmq_y = MMQ_Y_Q8_0_RDNA2; - nwarps = NWARPS_Q8_0_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q8_0_RDNA1; - mmq_y = MMQ_Y_Q8_0_RDNA1; - nwarps = NWARPS_Q8_0_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q8_0_AMPERE; - mmq_y = MMQ_Y_Q8_0_AMPERE; - nwarps = NWARPS_Q8_0_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q8_0_PASCAL; - mmq_y = MMQ_Y_Q8_0_PASCAL; - nwarps = NWARPS_Q8_0_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q2_K_RDNA2; - mmq_y = MMQ_Y_Q2_K_RDNA2; - nwarps = NWARPS_Q2_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q2_K_RDNA1; - mmq_y = MMQ_Y_Q2_K_RDNA1; - nwarps = NWARPS_Q2_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q2_K_AMPERE; - mmq_y = MMQ_Y_Q2_K_AMPERE; - nwarps = NWARPS_Q2_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q2_K_PASCAL; - mmq_y = MMQ_Y_Q2_K_PASCAL; - nwarps = NWARPS_Q2_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - -#if QK_K == 256 - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q3_K_RDNA2; - mmq_y = MMQ_Y_Q3_K_RDNA2; - nwarps = NWARPS_Q3_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q3_K_RDNA1; - mmq_y = MMQ_Y_Q3_K_RDNA1; - nwarps = NWARPS_Q3_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q3_K_AMPERE; - mmq_y = MMQ_Y_Q3_K_AMPERE; - nwarps = NWARPS_Q3_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q3_K_PASCAL; - mmq_y = MMQ_Y_Q3_K_PASCAL; - nwarps = NWARPS_Q3_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -#endif -} - -static void ggml_mul_mat_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q4_K_RDNA2; - mmq_y = MMQ_Y_Q4_K_RDNA2; - nwarps = NWARPS_Q4_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q4_K_RDNA1; - mmq_y = MMQ_Y_Q4_K_RDNA1; - nwarps = NWARPS_Q4_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q4_K_AMPERE; - mmq_y = MMQ_Y_Q4_K_AMPERE; - nwarps = NWARPS_Q4_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q4_K_PASCAL; - mmq_y = MMQ_Y_Q4_K_PASCAL; - nwarps = NWARPS_Q4_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q5_K_RDNA2; - mmq_y = MMQ_Y_Q5_K_RDNA2; - nwarps = NWARPS_Q5_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q5_K_RDNA1; - mmq_y = MMQ_Y_Q5_K_RDNA1; - nwarps = NWARPS_Q5_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q5_K_AMPERE; - mmq_y = MMQ_Y_Q5_K_AMPERE; - nwarps = NWARPS_Q5_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q5_K_PASCAL; - mmq_y = MMQ_Y_Q5_K_PASCAL; - nwarps = NWARPS_Q5_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - -static void ggml_mul_mat_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = get_cuda_global_info().devices[id].cc; - - int mmq_x, mmq_y, nwarps; - if (compute_capability >= CC_RDNA2) { - mmq_x = MMQ_X_Q6_K_RDNA2; - mmq_y = MMQ_Y_Q6_K_RDNA2; - nwarps = NWARPS_Q6_K_RDNA2; - } else if (compute_capability >= CC_OFFSET_AMD) { - mmq_x = MMQ_X_Q6_K_RDNA1; - mmq_y = MMQ_Y_Q6_K_RDNA1; - nwarps = NWARPS_Q6_K_RDNA1; - } else if (compute_capability >= CC_VOLTA) { - mmq_x = MMQ_X_Q6_K_AMPERE; - mmq_y = MMQ_Y_Q6_K_AMPERE; - nwarps = NWARPS_Q6_K_AMPERE; - } else if (compute_capability >= MIN_CC_DP4A) { - mmq_x = MMQ_X_Q6_K_PASCAL; - mmq_y = MMQ_Y_Q6_K_PASCAL; - nwarps = NWARPS_Q6_K_PASCAL; - } else { - GGML_ASSERT(false); - } - - const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; - const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); - const dim3 block_dims(WARP_SIZE, nwarps, 1); - - if (nrows_x % mmq_y == 0) { - const bool need_check = false; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } else { - const bool need_check = true; - mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - } -} - static void ggml_mul_mat_p021_f16_f32_cuda( const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y, cudaStream_t stream) { @@ -8543,280 +1170,6 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda( (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); } - -static void ggml_cpy_f16_f32_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_f32_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_f16_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_q8_0_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK8_0 == 0); - const int num_blocks = ne / QK8_0; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_q4_0_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK4_0 == 0); - const int num_blocks = ne / QK4_0; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_q4_1_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK4_1 == 0); - const int num_blocks = ne / QK4_1; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_q5_0_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK5_0 == 0); - const int num_blocks = ne / QK5_0; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_q5_1_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK5_1 == 0); - const int num_blocks = ne / QK5_1; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f32_iq4_nl_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - GGML_ASSERT(ne % QK4_NL == 0); - const int num_blocks = ne / QK4_NL; - cpy_f32_q<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - -static void ggml_cpy_f16_f16_cuda( - const char * cx, char * cdst, const int ne, - const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, - const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { - - const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; - cpy_f32_f16<<>> - (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); -} - - - -static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; - scale_f32<<>>(x, dst, scale, k); -} - -static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { - const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; - clamp_f32<<>>(x, dst, min, max, k); -} - -template -static void rope_cuda( - const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream -) { - GGML_ASSERT(ncols % 2 == 0); - const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); - const dim3 block_nums(nrows, num_blocks_x, 1); - if (pos == nullptr) { - rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims - ); - } else { - rope<<>>( - x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims - ); - } -} - -template -static void rope_neox_cuda( - const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream -) { - GGML_ASSERT(ncols % 2 == 0); - const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); - const dim3 block_nums(nrows, num_blocks_x, 1); - - const float theta_scale = powf(freq_base, -2.0f/n_dims); - const float inv_ndims = -1.0f / n_dims; - - if (pos == nullptr) { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, inv_ndims - ); - } else { - rope_neox<<>>( - x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, - theta_scale, inv_ndims - ); - } -} - -static void rope_glm_f32_cuda( - const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, - float freq_base, int n_ctx, cudaStream_t stream -) { - GGML_ASSERT(ncols % 4 == 0); - const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); - const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; - const dim3 block_nums(num_blocks_x, nrows, 1); - rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); -} - -static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, - const int k_rows, const int n_heads_log2_floor, const float m0, - const float m1, cudaStream_t stream) { - const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); - const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); - const dim3 block_nums(num_blocks_x, nrows, 1); - alibi_f32<<>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); -} - -static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - const dim3 block_dims(WARP_SIZE, 1, 1); - const dim3 block_nums(nrows, 1, 1); - k_sum_rows_f32<<>>(x, dst, ncols); -} - -static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { - // bitonic sort requires ncols to be power of 2 - GGML_ASSERT((ncols & (ncols - 1)) == 0); - - const dim3 block_dims(ncols, 1, 1); - const dim3 block_nums(1, nrows, 1); - if (order == GGML_SORT_ORDER_ASC) { - k_argsort_f32_i32<<>>(x, dst, ncols); - } else if (order == GGML_SORT_ORDER_DESC) { - k_argsort_f32_i32<<>>(x, dst, ncols); - } else { - GGML_ASSERT(false); - } -} - -static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { - const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); - const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; - const dim3 block_nums(nrows_x, block_num_x, 1); - diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); -} - -static void soft_max_f32_cuda(const float * x, const float * mask, const float * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { - int nth = WARP_SIZE; - while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; - const dim3 block_dims(nth, 1, 1); - const dim3 block_nums(nrows_x, 1, 1); - const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); - static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); - - const uint32_t n_head_kv = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - if (shmem < get_cuda_global_info().devices[ggml_cuda_get_device()].smpb) { - switch (ncols_x) { - case 32: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 64: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 128: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 256: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 512: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 1024: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 2048: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - case 4096: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - default: - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - break; - } - } else { - const size_t shmem_low = WARP_SIZE*sizeof(float); - soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); - } -} - -template -static void im2col_cuda(const float * x, T* dst, - int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, - int64_t batch, int64_t batch_offset, int64_t offset_delta, - int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { - const int parallel_elements = OW * KW * KH; - const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; - dim3 block_nums(num_blocks, OH, batch * IC); - im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); -} - static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { @@ -8853,670 +1206,6 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( } } -static void ggml_cuda_op_get_rows( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) { - - GGML_UNUSED(ctx); - - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); - GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); - GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); - - const int32_t * src1_i32 = (const int32_t *) src1_d; - - switch (src0->type) { - case GGML_TYPE_F16: - get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_F32: - get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q4_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q5_1: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - case GGML_TYPE_Q8_0: - get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); - break; - default: - // TODO: k-quants - fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); - GGML_ASSERT(false); - break; - } -} - -template -static void ggml_cuda_op_bin_bcast( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - - GGML_UNUSED(ctx); - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - op()(src0, src1, dst, (const half *) src0_dd, src1_dd, (half *) dst_dd, main_stream); - } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { - op()(src0, src1, dst, (const half *) src0_dd, src1_dd, dst_dd, main_stream); - } else { - fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, - ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); - } -} - -static void ggml_cuda_op_repeat( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) { - - ggml_cuda_op_bin_bcast>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_d); -} - -static void ggml_cuda_op_add( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - - ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -static void ggml_cuda_op_acc( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - - GGML_UNUSED(ctx); - - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported - - int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 - int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 - // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused - int offset = dst->op_params[3] / 4; // offset in bytes - - acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); - - GGML_UNUSED(dst); -} - -static void ggml_cuda_op_mul( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - - ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -static void ggml_cuda_op_div( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - - ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); -} - -static void ggml_cuda_op_gelu( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_silu( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_gelu_quick( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_tanh( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_relu( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_hardsigmoid( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_hardswish( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_leaky_relu( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float negative_slope; - memcpy(&negative_slope, dst->op_params, sizeof(float)); - - leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_sqr( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_norm( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_group_norm( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - int num_groups = dst->op_params[0]; - int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); - group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_concat( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - for (int i3 = 0; i3 < dst->ne[3]; i3++) { - concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); -} - -static void ggml_cuda_op_upscale( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - const int scale_factor = dst->op_params[0]; - - upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_pad( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors - - pad_f32_cuda(src0_dd, dst_dd, - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_arange( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - float start; - float stop; - float step; - memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); - memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); - memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); - - int64_t steps = (int64_t)ceil((stop - start) / step); - GGML_ASSERT(ggml_nelements(dst) == steps); - - arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream); - - GGML_UNUSED(src0); - GGML_UNUSED(src1); - GGML_UNUSED(src0_dd); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_timestep_embedding( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); - - const int dim = dst->op_params[0]; - const int max_period = dst->op_params[1]; - - timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_rms_norm( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - - rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_mul_mat_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { - - const int64_t ne00 = src0->ne[0]; - - const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - const int64_t row_diff = row_high - row_low; - - int id = ggml_cuda_get_device(); - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); - break; - default: - GGML_ASSERT(false); - break; - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddf_i); -} - -static void ggml_cuda_op_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { - - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - - const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - int id; - CUDA_CHECK(cudaGetDevice(&id)); - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - switch (src0->type) { - case GGML_TYPE_Q4_0: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_1: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_0: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_1: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q8_0: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q2_K: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q3_K: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q4_K: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q5_K: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_Q6_K: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XXS: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_XS: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ2_S: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_XXS: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_NL: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ4_XS: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - case GGML_TYPE_IQ3_S: - mul_mat_vec_q_cuda - (src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); - break; - default: - GGML_ASSERT(false); - break; - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddf_i); - GGML_UNUSED(src1_ncols); - GGML_UNUSED(src1_padded_row_size); -} - -static void ggml_cuda_op_dequantize_mul_mat_vec( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { - GGML_UNUSED(ctx); - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics -#ifdef GGML_CUDA_F16 - ggml_cuda_pool_alloc src1_dfloat_a(ctx.pool()); - half * src1_dfloat = nullptr; // dfloat == half - - bool src1_convert_f16 = - src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || - src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || - src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; - - if (src1_convert_f16) { - src1_dfloat = src1_dfloat_a.alloc(ne00); - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); - GGML_ASSERT(to_fp16_cuda != nullptr); - to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream); - } -#else - const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion -#endif // GGML_CUDA_F16 - - switch (src0->type) { - case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_1: - dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); - break; - case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); - break; - default: - GGML_ASSERT(false); - break; - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_ddq_i); - GGML_UNUSED(src1_ncols); - GGML_UNUSED(src1_padded_row_size); -} - static void ggml_cuda_op_mul_mat_cublas( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, @@ -9538,13 +1227,13 @@ static void ggml_cuda_op_mul_mat_cublas( // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = id == ctx.device ? ne0 : row_diff; + int64_t ldc = id == ctx.device ? ne0 : row_diff; - const int compute_capability = get_cuda_global_info().devices[id].cc; + const int compute_capability = ggml_cuda_info().devices[id].cc; if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 - ggml_cuda_pool_alloc src0_as_f16(ctx.pool()); + ggml_cuda_pool_alloc src0_as_f16(ctx.pool(id)); if (src0->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); GGML_ASSERT(to_fp16_cuda != nullptr); @@ -9554,7 +1243,7 @@ static void ggml_cuda_op_mul_mat_cublas( } const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get(); - ggml_cuda_pool_alloc src1_as_f16(ctx.pool()); + ggml_cuda_pool_alloc src1_as_f16(ctx.pool(id)); if (src1->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); GGML_ASSERT(to_fp16_cuda != nullptr); @@ -9563,7 +1252,7 @@ static void ggml_cuda_op_mul_mat_cublas( to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); } const half * src1_ptr = src1->type == GGML_TYPE_F16 ? (const half *) src1_ddf_i : src1_as_f16.get(); - ggml_cuda_pool_alloc dst_f16(ctx.pool(), row_diff*src1_ncols); + ggml_cuda_pool_alloc dst_f16(ctx.pool(id), row_diff*src1_ncols); const half alpha_f16 = 1.0f; const half beta_f16 = 0.0f; @@ -9617,345 +1306,6 @@ static void ggml_cuda_op_mul_mat_cublas( GGML_UNUSED(src1_padded_row_size); } -static void ggml_cuda_op_rope( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); - GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); - GGML_ASSERT(src0->type == dst->type); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t nrows = ggml_nrows(src0); - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - const int n_ctx = ((int32_t *) dst->op_params)[3]; - const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; - - // RoPE alteration for extended context - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - - const int32_t * pos = nullptr; - if ((mode & 1) == 0) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(src1->ne[0] == ne2); - pos = (const int32_t *) src1_dd; - } - - const bool is_neox = mode & 2; - const bool is_glm = mode & 4; - - rope_corr_dims corr_dims; - ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); - - // compute - if (is_glm) { - GGML_ASSERT(false); - rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream); - } else if (is_neox) { - if (src0->type == GGML_TYPE_F32) { - rope_neox_cuda( - (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_neox_cuda( - (const half *)src0_dd, (half *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else { - GGML_ASSERT(false); - } - } else { - if (src0->type == GGML_TYPE_F32) { - rope_cuda( - (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else if (src0->type == GGML_TYPE_F16) { - rope_cuda( - (const half *)src0_dd, (half *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, main_stream - ); - } else { - GGML_ASSERT(false); - } - } - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_alibi( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - const int64_t nrows = ggml_nrows(src0); - - //const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_head = ((int32_t *) dst->op_params)[1]; - float max_bias; - memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); - - //GGML_ASSERT(ne01 + n_past == ne00); - GGML_ASSERT(n_head == ne02); - - const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); - - const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); - - alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_pool2d( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int32_t * opts = (const int32_t *)dst->op_params; - enum ggml_op_pool op = static_cast(opts[0]); - const int k0 = opts[1]; - const int k1 = opts[2]; - const int s0 = opts[3]; - const int s1 = opts[4]; - const int p0 = opts[5]; - const int p1 = opts[6]; - - const int64_t IH = src0->ne[1]; - const int64_t IW = src0->ne[0]; - - const int64_t N = dst->ne[3]; - const int64_t OC = dst->ne[2]; - const int64_t OH = dst->ne[1]; - const int64_t OW = dst->ne[0]; - - const int parallel_elements = N * OC * OH * OW; - const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; - dim3 block_nums(num_blocks); - pool2d_nchw_kernel<<>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op); - - GGML_UNUSED(src1); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_im2col( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); - - const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; - const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; - const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; - const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; - const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; - const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; - - const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; - - const int64_t IC = src1->ne[is_2D ? 2 : 1]; - const int64_t IH = is_2D ? src1->ne[1] : 1; - const int64_t IW = src1->ne[0]; - - const int64_t KH = is_2D ? src0->ne[1] : 1; - const int64_t KW = src0->ne[0]; - - const int64_t OH = is_2D ? dst->ne[2] : 1; - const int64_t OW = dst->ne[1]; - - const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 - const int64_t batch = src1->ne[3]; - const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 - - if(dst->type == GGML_TYPE_F16) { - im2col_cuda(src1_dd, (half*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); - } else { - im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); - } - - GGML_UNUSED(src0); - GGML_UNUSED(src0_dd); -} - -static void ggml_cuda_op_sum_rows( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_argsort( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_I32); - - const int64_t ncols = src0->ne[0]; - const int64_t nrows = ggml_nrows(src0); - - enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; - - argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_diag_mask_inf( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int nrows0 = ggml_nrows(src0); - - const int n_past = ((int32_t *) dst->op_params)[0]; - - diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_soft_max( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional - - const int64_t ne00 = src0->ne[0]; - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - float scale = 1.0f; - float max_bias = 0.0f; - - memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); - - // positions tensor - float * src2_dd = nullptr; - - ggml_tensor * src2 = dst->src[2]; - const bool use_src2 = src2 != nullptr; - - if (use_src2) { - src2_dd = (float *)src2->data; - } - - soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); -} - -static void ggml_cuda_op_scale( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float scale; - memcpy(&scale, dst->op_params, sizeof(float)); - - scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); - CUDA_CHECK(cudaGetLastError()); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -static void ggml_cuda_op_clamp( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, - const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - GGML_UNUSED(ctx); - GGML_ASSERT(src0->type == GGML_TYPE_F32); - GGML_ASSERT( dst->type == GGML_TYPE_F32); - - float min; - float max; - memcpy(&min, dst->op_params, sizeof(float)); - memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); - - clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); - CUDA_CHECK(cudaGetLastError()); - - GGML_UNUSED(src1); - GGML_UNUSED(dst); - GGML_UNUSED(src1_dd); -} - -// TODO: remove this function -static void ggml_cuda_op_flatten(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { - GGML_ASSERT(!src0 || ggml_backend_buffer_is_cuda(src0->buffer)); - GGML_ASSERT(!src1 || ggml_backend_buffer_is_cuda(src1->buffer)); - GGML_ASSERT( ggml_backend_buffer_is_cuda(dst->buffer)); - - // dd = data device - float * src0_ddf = src0 ? (float *) src0->data : nullptr; - float * src1_ddf = src1 ? (float *) src1->data : nullptr; - float * dst_ddf = (float *) dst->data; - - ggml_cuda_set_device(ctx.device); - - // do the computation - op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, ctx.stream()); - CUDA_CHECK(cudaGetLastError()); -} - static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; @@ -9999,6 +1349,8 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { } } } + + ggml_cuda_set_device(main_device); #endif // NDEBUG peer_access_enabled = enable_peer_access; @@ -10027,8 +1379,8 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; - const int nb2 = dst->nb[2]; - const int nb3 = dst->nb[3]; + const int64_t nb2 = dst->nb[2]; + const int64_t nb3 = dst->nb[3]; GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer)); GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer)); @@ -10295,98 +1647,7 @@ static void ggml_cuda_op_mul_mat( } } -static void ggml_cuda_repeat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_repeat); -} - -static void ggml_cuda_get_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_get_rows); -} - -static void ggml_cuda_add(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_add); -} - -static void ggml_cuda_acc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_acc); -} - -static void ggml_cuda_mul(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_mul); -} - -static void ggml_cuda_div(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_div); -} - -static void ggml_cuda_gelu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu); -} - -static void ggml_cuda_silu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_silu); -} - -static void ggml_cuda_gelu_quick(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu_quick); -} - -static void ggml_cuda_tanh(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_tanh); -} - -static void ggml_cuda_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_relu); -} - -static void ggml_cuda_hardsigmoid(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardsigmoid); -} - -static void ggml_cuda_hardswish(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardswish); -} -static void ggml_cuda_leaky_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_leaky_relu); -} - -static void ggml_cuda_sqr(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sqr); -} - -static void ggml_cuda_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_norm); -} - -static void ggml_cuda_group_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_group_norm); -} - -static void ggml_cuda_concat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_concat); -} - -static void ggml_cuda_upscale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_upscale); -} - -static void ggml_cuda_pad(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pad); -} - -static void ggml_cuda_arange(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_arange); -} - -static void ggml_cuda_timestep_embedding(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_timestep_embedding); -} - -static void ggml_cuda_rms_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rms_norm); -} - -static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation @@ -10400,7 +1661,6 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg const int64_t ne12 = src1->ne[2]; - ggml_cuda_set_device(ctx.device); cudaStream_t main_stream = ctx.stream(); void * src0_ddq = src0->data; @@ -10410,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } -static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); @@ -10427,7 +1687,6 @@ static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml const int64_t ne12 = src1->ne[2]; - ggml_cuda_set_device(ctx.device); cudaStream_t main_stream = ctx.stream(); void * src0_ddq = src0->data; @@ -10475,7 +1734,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co const int64_t ne_dst = ggml_nelements(dst); - ggml_cuda_set_device(ctx.device); cudaStream_t main_stream = ctx.stream(); CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); @@ -10622,16 +1880,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor continue; } - if (min_compute_capability > get_cuda_global_info().devices[id].cc) { - min_compute_capability = get_cuda_global_info().devices[id].cc; + if (min_compute_capability > ggml_cuda_info().devices[id].cc) { + min_compute_capability = ggml_cuda_info().devices[id].cc; } - if (get_cuda_global_info().devices[id].cc == 610) { + if (ggml_cuda_info().devices[id].cc == 610) { any_pascal_with_slow_fp16 = true; } } } else { - min_compute_capability = get_cuda_global_info().devices[ctx.device].cc; - any_pascal_with_slow_fp16 = get_cuda_global_info().devices[ctx.device].cc == 610; + min_compute_capability = ggml_cuda_info().devices[ctx.device].cc; + any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610; } // check data types and tensor shapes for custom matrix multiplication kernels: @@ -10690,7 +1948,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); - } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { @@ -10704,224 +1962,122 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor } } -#if 0 -template -static __global__ void k_compute_batched_ptrs_id( - const void ** ptrs_src, void ** ptrs_dst, - int ne12, int ne13, - int ne23, - int nb02, int nb03, - int nb12, int nb13, - int nb2, int nb3, - int r2, int r3, - ggml_type src0_type, half * src0_as_f16, int64_t src0_ne, - const half * src1_f16, half * dst_f16, - const int32_t * ids, const int id, - Srcs... src0s) { +struct mmid_row_mapping { + int32_t i1; + int32_t i2; +}; - int i = ids[id]; +static __global__ void k_copy_src1_to_contiguous(const char * __restrict__ src1_original, char * __restrict__ src1_contiguous, + int * __restrict__ cur_src1_row, mmid_row_mapping * __restrict__ row_mapping, + const char * __restrict ids, int64_t i02, size_t ids_nb1, size_t ids_nb0, + int64_t ne11, int64_t ne10, + size_t nb11, size_t nb12) { + int32_t iid1 = blockIdx.x; + int32_t id = blockIdx.y; - half * src0_f16; - const void * srcs_ar[] = { (const half *) src0s... }; - if (src0_type == GGML_TYPE_F16) { - src0_f16 = (half *) srcs_ar[i]; - } else { - src0_f16 = src0_as_f16; - if (threadIdx.x == 0 && threadIdx.y == 0) { - const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type); - to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget); - } - } + const int32_t row_id_i = *(const int32_t *) (ids + iid1*ids_nb1 + id*ids_nb0); - int i13 = blockIdx.x * blockDim.x + threadIdx.x; - int i12 = blockIdx.y * blockDim.y + threadIdx.y; - - if (i13 >= ne13 || i12 >= ne12) { + if (row_id_i != i02) { return; } - int i03 = i13 / r3; - int i02 = i12 / r2; + const int64_t i11 = id % ne11; + const int64_t i12 = iid1; - ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02 + i03*nb03; - ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst_f16 + i12* nb2/2 + i13* nb3/2; + __shared__ int src1_row; + if (threadIdx.x == 0) { + src1_row = atomicAdd(cur_src1_row, 1); + row_mapping[src1_row] = {id, iid1}; + } + __syncthreads(); + + const float * src1_row_original = (const float *)(src1_original + i11*nb11 + i12*nb12); + float * src1_row_contiguous = (float *)(src1_contiguous + src1_row*nb11); + + for (int i = threadIdx.x; i < ne10; i += blockDim.x) { + src1_row_contiguous[i] = src1_row_original[i]; + } } -static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { - const struct ggml_tensor * ids = dst->src[0]; - const struct ggml_tensor * src1 = dst->src[1]; - const struct ggml_tensor * src00 = dst->src[2]; +static __global__ void k_copy_dst_from_contiguous(char * __restrict__ dst_original, const char * __restrict__ dst_contiguous, + const mmid_row_mapping * __restrict__ row_mapping, + int64_t ne0, + size_t nb1, size_t nb2) { + int32_t i = blockIdx.x; - const int id = dst->op_params[0]; + const int32_t i1 = row_mapping[i].i1; + const int32_t i2 = row_mapping[i].i2; - GGML_ASSERT(!ggml_is_transposed(src00)); - GGML_ASSERT(!ggml_is_transposed(src1)); + const float * dst_row_contiguous = (const float *)(dst_contiguous + i*nb1); + float * dst_row_original = (float *)(dst_original + i1*nb1 + i2*nb2); - GGML_ASSERT(src00->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00); - const int64_t ne01 = src00->ne[1]; - const int64_t ne02 = src00->ne[2]; - const int64_t ne03 = src00->ne[3]; - - //const int64_t nb01 = src00->nb[1]; - const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02); - const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03); - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - const int64_t ne13 = src1->ne[3]; - - //const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); - const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); - - const int64_t ne1 = ggml_nelements(src1); - const int64_t ne = ggml_nelements(dst); - - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); - - //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - //void * src0_ddq = src0_extra->data_device[g_main_device]; - //half * src0_as_f16 = (half *) src0_ddq; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - - // convert src1 to fp16 - const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); - GGML_ASSERT(to_fp16_cuda != nullptr); - - size_t src1_as = 0; - half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); - to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); - - size_t dst_as = 0; - half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); - - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); - - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - - const half alpha_f16 = 1.0f; - const half beta_f16 = 0.0f; - - // use cublasGemmBatchedEx - const int ne23 = ne12*ne13; - - const void ** ptrs_src = nullptr; - void ** ptrs_dst = nullptr; - - size_t ptrs_src_s = 0; - size_t ptrs_dst_s = 0; - - ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s); - ptrs_dst = ( void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s); - - int64_t src0_ne = ggml_nelements(src00); - half * src0_as_f16 = nullptr; - size_t src0_as = 0; - if (src00->type != GGML_TYPE_F16) { - src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as); + for (int j = threadIdx.x; j < ne0; j += blockDim.x) { + dst_row_original[j] = dst_row_contiguous[j]; } - - static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6"); - dim3 block_dims(ne13, ne12); - k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>( - ptrs_src, ptrs_dst, - ne12, ne13, - ne23, - ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half), - nb12, nb13, - dst->nb[2], dst->nb[3], - r2, r3, - src00->type, src0_as_f16, src0_ne, - src1_as_f16, dst_f16, - (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id, - dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr, - dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr, - dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr, - dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr - ); - CUDA_CHECK(cudaGetLastError()); - - CUBLAS_CHECK( - cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, - ne01, ne11, ne10, - &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00, - (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10, - &beta_f16, ( void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01, - ne23, - CUBLAS_COMPUTE_16F, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - - if (src0_as != 0) { - ggml_cuda_pool_free(src0_as_f16, src0_as); - } - if (ptrs_src_s != 0) { - ggml_cuda_pool_free(ptrs_src, ptrs_src_s); - } - if (ptrs_dst_s != 0) { - ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s); - } - - const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); - to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); - - ggml_cuda_pool_free(src1_as_f16, src1_as); - ggml_cuda_pool_free(dst_f16, dst_as); } -#endif -static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { -#if 0 - ggml_cuda_mul_mat_id_cublas(dst); - // TODO: mmq/mmv support -#endif +static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * ids = dst->src[2]; + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers"); + cudaStream_t stream = ctx.stream(); - const size_t nb11 = src1->nb[1]; - const size_t nb1 = dst->nb[1]; - - const struct ggml_tensor * ids = src0; - const int32_t id = ((int32_t *) dst->op_params)[0]; - const int32_t n_as = ((int32_t *) dst->op_params)[1]; + const int64_t n_as = ne02; + const int64_t n_ids = ids->ne[0]; std::vector ids_host(ggml_nbytes(ids)); const char * ids_dev = (const char *) ids->data; CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); CUDA_CHECK(cudaStreamSynchronize(stream)); + ggml_tensor src0_row = *src0; ggml_tensor src1_row = *src1; - ggml_tensor dst_row = *dst; + ggml_tensor dst_row = *dst; + char * src0_original = (char *) src0->data; char * src1_original = (char *) src1->data; char * dst_original = (char *) dst->data; - if (src1->ne[1] == 1) { - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[3] = nb02; - GGML_ASSERT(row_id >= 0 && row_id < n_as); + src1_row.ne[1] = 1; + src1_row.ne[2] = 1; + src1_row.ne[3] = 1; + src1_row.nb[2] = nb11; + src1_row.nb[3] = nb11; - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + dst_row.ne[1] = 1; + dst_row.ne[2] = 1; + dst_row.ne[3] = 1; + dst_row.nb[2] = nb1; + dst_row.nb[3] = nb1; - src1_row.data = src1_original + i01*src1->nb[1]; - dst_row.data = dst_original + i01*dst->nb[1]; + if (ne12 == 1) { + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); + GGML_ASSERT(i02 >= 0 && i02 < n_as); + + const int64_t i11 = id % ne11; + const int64_t i12 = iid1; + + const int64_t i1 = id; + const int64_t i2 = i12; + + src0_row.data = src0_original + i02*nb02; + src1_row.data = src1_original + i11*nb11 + i12*nb12; + dst_row.data = dst_original + i1*nb1 + i2*nb2; + + ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + } } } else { ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); @@ -10930,334 +2086,230 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, const ggml_ten src1_row.data = src1_contiguous.get(); dst_row.data = dst_contiguous.get(); - for (int32_t row_id = 0; row_id < n_as; ++row_id) { - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - + for (int64_t i02 = 0; i02 < n_as; i02++) { int64_t num_src1_rows = 0; - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - if (row_id_i != row_id) { - continue; + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + + GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + + if (row_id_i != i02) { + continue; + } + + num_src1_rows++; } - - GGML_ASSERT(row_id >= 0 && row_id < n_as); - - CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11, - nb11, cudaMemcpyDeviceToDevice, stream)); - num_src1_rows++; } if (num_src1_rows == 0) { continue; } - src1_row.ne[1] = num_src1_rows; - dst_row.ne[1] = num_src1_rows; + ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); + ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); + CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); + { + dim3 block_dims(std::min((unsigned int)ne10, 768u)); + dim3 grid_dims(ids->ne[1], n_ids); + k_copy_src1_to_contiguous<<>>( + src1_original, src1_contiguous.get(), + dev_cur_src1_row.get(), dev_row_mapping.get(), + ids_dev, i02, ids->nb[1], ids->nb[0], + ne11, ne10, + nb11, nb12); + CUDA_CHECK(cudaGetLastError()); + } + + src0_row.data = src0_original + i02*nb02; + + GGML_ASSERT(nb11 == sizeof(float)*ne10); + GGML_ASSERT(nb1 == sizeof(float)*ne0); + + src1_row.ne[1] = num_src1_rows; src1_row.nb[1] = nb11; src1_row.nb[2] = num_src1_rows*nb11; src1_row.nb[3] = num_src1_rows*nb11; + dst_row.ne[1] = num_src1_rows; dst_row.nb[1] = nb1; dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); + ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - num_src1_rows = 0; - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); - - if (row_id_i != row_id) { - continue; - } - - GGML_ASSERT(row_id >= 0 && row_id < n_as); - - CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1, - nb1, cudaMemcpyDeviceToDevice, stream)); - num_src1_rows++; + { + dim3 block_dims(std::min((unsigned int)ne0, 768u)); + dim3 grid_dims(num_src1_rows); + k_copy_dst_from_contiguous<<>>( + dst_original, dst_contiguous.get(), + dev_row_mapping.get(), + ne0, + nb1, nb2); + CUDA_CHECK(cudaGetLastError()); } } } } -static void ggml_cuda_scale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_scale); -} - -static void ggml_cuda_clamp(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_clamp); -} - -static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - - GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); - - GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); - GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); - - const int64_t ne00 = src0->ne[0]; - const int64_t ne01 = src0->ne[1]; - const int64_t ne02 = src0->ne[2]; - - //GGML_ASSERT(src0->ne[3] == 1); - - const int64_t nb00 = src0->nb[0]; - const int64_t nb01 = src0->nb[1]; - const int64_t nb02 = src0->nb[2]; - const int64_t nb03 = src0->nb[3]; - - const int64_t ne10 = src1->ne[0]; - const int64_t ne11 = src1->ne[1]; - const int64_t ne12 = src1->ne[2]; - - //GGML_ASSERT(src1->ne[3] == 1); - - const int64_t nb10 = src1->nb[0]; - const int64_t nb11 = src1->nb[1]; - const int64_t nb12 = src1->nb[2]; - const int64_t nb13 = src1->nb[3]; - - ggml_cuda_set_device(ctx.device); - cudaStream_t main_stream = ctx.stream(); - - char * src0_ddc = (char *) src0->data; - char * src1_ddc = (char *) src1->data; - - if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { - ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { - ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { - ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { - ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { - ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { - ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { - ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { - ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); - } else { - fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, - ggml_type_name(src0->type), ggml_type_name(src1->type)); - GGML_ASSERT(false); +static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) { + // why is this here instead of mul_mat? + if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) { + ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device); } - GGML_UNUSED(dst); -} - -static void ggml_cuda_dup(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - // TODO: why do we pass dst as src1 here? - ggml_cuda_cpy(ctx, src0, dst, nullptr); - GGML_UNUSED(src1); -} - -static void ggml_cuda_diag_mask_inf(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_diag_mask_inf); -} - -static void ggml_cuda_soft_max(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_soft_max); -} - -static void ggml_cuda_rope(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rope); -} - -static void ggml_cuda_alibi(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_alibi); -} - -static void ggml_cuda_pool2d(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pool2d); -} - -static void ggml_cuda_im2col(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_im2col); -} - -static void ggml_cuda_sum_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sum_rows); -} - -static void ggml_cuda_argsort(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_argsort); -} - -static void ggml_cuda_nop(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(src0); - GGML_UNUSED(src1); - GGML_UNUSED(dst); -} - -static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * tensor) { - // FIXME: where should this be? - if (tensor->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(tensor->src[0]->buffer)) { - ggml_cuda_set_peer_access(tensor->src[1]->ne[1], ctx.device); - } - - ggml_cuda_func_t func; - - switch (tensor->op) { + switch (dst->op) { case GGML_OP_REPEAT: - func = ggml_cuda_repeat; + ggml_cuda_op_repeat(ctx, dst); break; case GGML_OP_GET_ROWS: - func = ggml_cuda_get_rows; + ggml_cuda_op_get_rows(ctx, dst); break; case GGML_OP_DUP: - func = ggml_cuda_dup; + ggml_cuda_dup(ctx, dst); + break; + case GGML_OP_CPY: + ggml_cuda_cpy(ctx, dst->src[0], dst->src[1]); + break; + case GGML_OP_CONT: + ggml_cuda_dup(ctx, dst); break; case GGML_OP_ADD: - func = ggml_cuda_add; + ggml_cuda_op_add(ctx, dst); break; case GGML_OP_ACC: - func = ggml_cuda_acc; + ggml_cuda_op_acc(ctx, dst); break; case GGML_OP_MUL: - func = ggml_cuda_mul; + ggml_cuda_op_mul(ctx, dst); break; case GGML_OP_DIV: - func = ggml_cuda_div; + ggml_cuda_op_div(ctx, dst); break; case GGML_OP_UNARY: - switch (ggml_get_unary_op(tensor)) { + switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_GELU: - func = ggml_cuda_gelu; + ggml_cuda_op_gelu(ctx, dst); break; case GGML_UNARY_OP_SILU: - func = ggml_cuda_silu; + ggml_cuda_op_silu(ctx, dst); break; case GGML_UNARY_OP_GELU_QUICK: - func = ggml_cuda_gelu_quick; + ggml_cuda_op_gelu_quick(ctx, dst); break; case GGML_UNARY_OP_TANH: - func = ggml_cuda_tanh; + ggml_cuda_op_tanh(ctx, dst); break; case GGML_UNARY_OP_RELU: - func = ggml_cuda_relu; + ggml_cuda_op_relu(ctx, dst); break; case GGML_UNARY_OP_HARDSIGMOID: - func = ggml_cuda_hardsigmoid; + ggml_cuda_op_hardsigmoid(ctx, dst); break; case GGML_UNARY_OP_HARDSWISH: - func = ggml_cuda_hardswish; + ggml_cuda_op_hardswish(ctx, dst); break; default: return false; } break; case GGML_OP_NORM: - func = ggml_cuda_norm; + ggml_cuda_op_norm(ctx, dst); break; case GGML_OP_GROUP_NORM: - func = ggml_cuda_group_norm; + ggml_cuda_op_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - func = ggml_cuda_concat; + ggml_cuda_op_concat(ctx, dst); break; case GGML_OP_UPSCALE: - func = ggml_cuda_upscale; + ggml_cuda_op_upscale(ctx, dst); break; case GGML_OP_PAD: - func = ggml_cuda_pad; + ggml_cuda_op_pad(ctx, dst); break; case GGML_OP_ARANGE: - func = ggml_cuda_arange; + ggml_cuda_op_arange(ctx, dst); break; case GGML_OP_TIMESTEP_EMBEDDING: - func = ggml_cuda_timestep_embedding; + ggml_cuda_op_timestep_embedding(ctx, dst); break; case GGML_OP_LEAKY_RELU: - func = ggml_cuda_leaky_relu; + ggml_cuda_op_leaky_relu(ctx, dst); break; case GGML_OP_RMS_NORM: - func = ggml_cuda_rms_norm; + ggml_cuda_op_rms_norm(ctx, dst); break; case GGML_OP_MUL_MAT: - if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + if (dst->src[0]->ne[3] != dst->src[1]->ne[3]) { + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, dst->name, dst->src[0]->ne[3], dst->src[1]->ne[3]); return false; } else { - func = ggml_cuda_mul_mat; + ggml_cuda_mul_mat(ctx, dst->src[0], dst->src[1], dst); } break; case GGML_OP_MUL_MAT_ID: - func = ggml_cuda_mul_mat_id; + ggml_cuda_mul_mat_id(ctx, dst); break; case GGML_OP_SCALE: - func = ggml_cuda_scale; + ggml_cuda_op_scale(ctx, dst); break; case GGML_OP_SQR: - func = ggml_cuda_sqr; + ggml_cuda_op_sqr(ctx, dst); break; case GGML_OP_CLAMP: - func = ggml_cuda_clamp; - break; - case GGML_OP_CPY: - func = ggml_cuda_cpy; - break; - case GGML_OP_CONT: - func = ggml_cuda_dup; + ggml_cuda_op_clamp(ctx, dst); break; case GGML_OP_NONE: case GGML_OP_RESHAPE: case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: - func = ggml_cuda_nop; - break; + break; case GGML_OP_DIAG_MASK_INF: - func = ggml_cuda_diag_mask_inf; + ggml_cuda_op_diag_mask_inf(ctx, dst); break; case GGML_OP_SOFT_MAX: - func = ggml_cuda_soft_max; + ggml_cuda_op_soft_max(ctx, dst); break; case GGML_OP_ROPE: - func = ggml_cuda_rope; + ggml_cuda_op_rope(ctx, dst); break; case GGML_OP_ALIBI: - func = ggml_cuda_alibi; + ggml_cuda_op_alibi(ctx, dst); break; case GGML_OP_IM2COL: - func = ggml_cuda_im2col; + ggml_cuda_op_im2col(ctx, dst); break; case GGML_OP_POOL_2D: - func = ggml_cuda_pool2d; + ggml_cuda_op_pool2d(ctx, dst); break; case GGML_OP_SUM_ROWS: - func = ggml_cuda_sum_rows; + ggml_cuda_op_sum_rows(ctx, dst); break; case GGML_OP_ARGSORT: - func = ggml_cuda_argsort; + ggml_cuda_op_argsort(ctx, dst); + break; + case GGML_OP_FLASH_ATTN_EXT: + ggml_cuda_flash_attn_ext(ctx, dst); break; default: return false; } - func(ctx, tensor->src[0], tensor->src[1], tensor); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "%s: %s failed\n", __func__, ggml_op_desc(dst)); + CUDA_CHECK(err); + } + return true; } - //////////////////////////////////////////////////////////////////////////////// - // backend GGML_CALL static const char * ggml_backend_cuda_name(ggml_backend_t backend) { @@ -11322,19 +2374,23 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ GGML_ASSERT(cuda_ctx_src->device == buf_ctx_src->device); GGML_ASSERT(cuda_ctx_dst->device == buf_ctx_dst->device); + // copy on src stream + if (cuda_ctx_src->device == cuda_ctx_dst->device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); + } else { +#ifdef GGML_CUDA_NO_PEER_COPY + return false; +#else + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); +#endif + } + + // record event on src stream if (!cuda_ctx_src->copy_event) { ggml_cuda_set_device(cuda_ctx_src->device); CUDA_CHECK(cudaEventCreateWithFlags(&cuda_ctx_src->copy_event, cudaEventDisableTiming)); } - // copy on src stream - if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); - } else { - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); - } - - // record event on src stream CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream())); // wait on dst stream for the copy to complete @@ -11354,32 +2410,304 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { GGML_UNUSED(backend); } +static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { + graph_node_properties->node_address = node->data; + graph_node_properties->node_op = node->op; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + graph_node_properties->ne[i] = node->ne[i]; + graph_node_properties->nb[i] = node->nb[i]; + } + for (int i = 0; i < GGML_MAX_SRC; i++) { + graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr; + } +} + +static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) { + if (node->data != graph_node_properties->node_address && + node->op != GGML_OP_CPY && + node->op != GGML_OP_VIEW) { + return false; + } + + if (node->op != graph_node_properties->node_op) { + return false; + } + + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (node->ne[i] != graph_node_properties->ne[i]) { + return false; + } + if (node->nb[i] != graph_node_properties->nb[i]) { + return false; + } + } + + for (int i = 0; i < GGML_MAX_SRC; i++) { + if (node->src[i] && + node->src[i]->data != graph_node_properties->src_address[i] && + node->op != GGML_OP_CPY && + node->op != GGML_OP_VIEW + ) { + return false; + } + } + return true; +} + GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; +#ifdef USE_CUDA_GRAPH + static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr); - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; + // Objects required for CUDA Graph + if (cuda_ctx->cuda_graph == nullptr) { + cuda_ctx->cuda_graph.reset(new ggml_cuda_graph()); + } + + bool use_cuda_graph = true; + bool cuda_graph_update_required = false; + // pointer to CUDA cpy kernel, which is required to identify + // kernel parameters which need updated in the graph for each token + void * ggml_cuda_cpy_fn_ptr = nullptr; + + if (cuda_ctx->cuda_graph->graph == nullptr) { + if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) { + cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true; +#ifndef NDEBUG + fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__); +#endif + } + } + + // Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly, + // or previous graph capture failure. + // Also disable for multi-gpu for now. TO DO investigate + if (disable_cuda_graphs_due_to_env + || cuda_ctx->cuda_graph->disable_due_to_gpu_arch + || cuda_ctx->cuda_graph->disable_due_to_too_many_updates + || cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) { + use_cuda_graph = false; + } + + if (use_cuda_graph) { + if (cuda_ctx->cuda_graph->instance == nullptr) { + cuda_graph_update_required = true; } + // Check if the graph size has changed + if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) { + cuda_graph_update_required = true; + cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes); + } + + // Loop over nodes in GGML graph to determine if CUDA graph update is required + // and store properties to allow this comparison for the next token + for (int i = 0; i < cgraph->n_nodes; i++) { + bool has_matching_properties = true; + if (!cuda_graph_update_required) { + has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); + } + if (!has_matching_properties) { + cuda_graph_update_required = true; + } + set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]); + } + + // Loop over nodes in GGML graph to obtain info needed for CUDA graph + cuda_ctx->cuda_graph->updated_kernel_arg.clear(); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + + if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) { + use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture #ifndef NDEBUG - assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - for (int j = 0; j < GGML_MAX_SRC; j++) { - if (node->src[j] != nullptr) { - assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); + fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__); +#endif + } + + if (node->op == GGML_OP_MUL_MAT_ID) { + use_cuda_graph = false; // This node type is not supported by CUDA graph capture +#ifndef NDEBUG + fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__); +#endif + } + + if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) { + // disable CUDA graphs for batch size > 1 for now. + // Changes in batch size or context size can cause changes to the grid size of some kernels. + use_cuda_graph = false; +#ifndef NDEBUG + fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]); +#endif + } + + if (node->op == GGML_OP_CPY) { + // store the copy op parameter which changes with each token. + cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data)); + if (ggml_cuda_cpy_fn_ptr == nullptr) { + // store a pointer to the copy op CUDA kernel to identify it later + ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]); + } + } + + if (!use_cuda_graph) { + break; } } + + // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates. + if (cuda_graph_update_required) { + cuda_ctx->cuda_graph->number_consecutive_updates++; + } else { + cuda_ctx->cuda_graph->number_consecutive_updates = 0; + } + + if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) { + cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true; +#ifndef NDEBUG + fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__); +#endif + } + } + + if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture + CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed)); + } + +#else + bool use_cuda_graph = false; + bool cuda_graph_update_required = false; +#endif // USE_CUDA_GRAPH + + bool graph_evaluated_or_captured = false; + + while (!graph_evaluated_or_captured) { + // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph. + // With the use of CUDA graphs, the execution will be performed by the graph launch. + if (!use_cuda_graph || cuda_graph_update_required) { + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + +#ifndef NDEBUG + assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (node->src[j] != nullptr) { + assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); + } + } #endif - bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); - if (!ok) { - fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); + if (!ok) { + fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + } } - GGML_ASSERT(ok); + +#ifdef USE_CUDA_GRAPH + if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture + if (cuda_ctx->cuda_graph->graph != nullptr) { + CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph)); + cuda_ctx->cuda_graph->graph = nullptr; + } + CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph)); + +#if 0 + if (disable_cuda_graphs_due_to_failed_capture) { + use_cuda_graph = false; + cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true; +#ifndef NDEBUG + fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__); +#endif + } else { + graph_evaluated_or_captured = true; // CUDA graph has been captured + } +#endif + graph_evaluated_or_captured = true; // CUDA graph has been captured + } else { + graph_evaluated_or_captured = true; // ggml graph has been directly evaluated + } + } + + if (use_cuda_graph) { + if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph. + CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); + } + + // Perform update to graph (if required for this token), and change copy parameter (required for every token) + + if (cuda_graph_update_required) { + // Extract nodes from graph + if (cuda_ctx->cuda_graph->num_nodes == 0) { + // First call with null argument gets number of nodes in graph + CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes)); + } + // Subsequent call with non-null argument gets nodes + cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes); + cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes); + if (cuda_ctx->cuda_graph->num_nodes > 0) { + CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes)); + + // Loop over nodes, and extract kernel parameters from each node + for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { + cudaGraphNodeType node_type; + CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type)); + if (node_type == cudaGraphNodeTypeKernel) { + cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime + if (stat == cudaErrorInvalidDeviceFunction) { + // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node. + // We don't need to update blas nodes, so clear error and move on. + cudaGetLastError(); + } else { + GGML_ASSERT(stat == cudaSuccess); + } + } + } + } + } + + // One of the arguments to the copy kernel is updated for each token, hence we need to + // replace that argument with the updated value in the CUDA graph + if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured + int k = 0; + for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) { + if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) { + char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++); + cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr; + CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i])); + } + } + } + + // Update graph executable + cudaGraphExecUpdateResultInfo result_info; + cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info); + if (stat == cudaErrorGraphExecUpdateFailure) { +#ifndef NDEBUG + fprintf(stderr, "%s: CUDA graph update failed\n", __func__); +#endif + // The pre-existing graph exec cannot be updated due to violated constraints + // so instead clear error and re-instantiate + cudaGetLastError(); + CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance)); + cuda_ctx->cuda_graph->instance = nullptr; + CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0)); + } else { + GGML_ASSERT(stat == cudaSuccess); + } + // Launch graph + CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream())); +#else + graph_evaluated_or_captured = true; +#endif // USE_CUDA_GRAPH } return GGML_STATUS_SUCCESS; @@ -11419,7 +2747,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons ggml_type a_type = a->type; if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S || - a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { + a_type == GGML_TYPE_IQ1_M || a_type == GGML_TYPE_IQ2_S || a_type == GGML_TYPE_IQ4_XS) { if (b->ne[1] == 1 && ggml_nrows(b) > 1) { return false; } @@ -11513,6 +2841,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: + case GGML_OP_FLASH_ATTN_EXT: return true; default: return false; @@ -11524,12 +2853,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { const int min_batch_size = 32; - return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || + (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); GGML_UNUSED(backend); } static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { +#ifdef GGML_CUDA_NO_PEER_COPY + return nullptr; +#else ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_device(cuda_ctx->device); @@ -11541,6 +2874,7 @@ static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) /* .backend = */ backend, /* .context = */ event, }; +#endif } static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { @@ -11630,7 +2964,7 @@ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { } GGML_CALL int ggml_backend_cuda_get_device_count() { - return get_cuda_global_info().device_count; + return ggml_cuda_info().device_count; } GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { @@ -11650,6 +2984,7 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size return false; } +#if CUDART_VERSION >= 11100 cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); if (err != cudaSuccess) { // clear the error @@ -11660,6 +2995,9 @@ GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size return false; } return true; +#else + return false; +#endif } GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { diff --git a/ggml-cuda/acc.cu b/ggml-cuda/acc.cu new file mode 100644 index 000000000..96bfe1c9d --- /dev/null +++ b/ggml-cuda/acc.cu @@ -0,0 +1,47 @@ +#include "acc.cuh" + +static __global__ void acc_f32(const float * x, const float * y, float * dst, const int ne, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, int offset) { + const int i = blockDim.x * blockIdx.x + threadIdx.x; + if (i >= ne) { + return; + } + int src1_idx = i - offset; + int oz = src1_idx / nb2; + int oy = (src1_idx - (oz * nb2)) / nb1; + int ox = src1_idx % nb1; + if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) { + dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11]; + } else { + dst[i] = x[i]; + } +} + +static void acc_f32_cuda(const float * x, const float * y, float * dst, const int n_elements, + const int ne10, const int ne11, const int ne12, + const int nb1, const int nb2, const int offset, cudaStream_t stream) { + int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE; + acc_f32<<>>(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset); +} + +void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported + + int nb1 = dst->op_params[0] / 4; // 4 bytes of float32 + int nb2 = dst->op_params[1] / 4; // 4 bytes of float32 + // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused + int offset = dst->op_params[3] / 4; // offset in bytes + + acc_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, stream); +} diff --git a/ggml-cuda/acc.cuh b/ggml-cuda/acc.cuh new file mode 100644 index 000000000..1168ea1b2 --- /dev/null +++ b/ggml-cuda/acc.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ACC_BLOCK_SIZE 256 + +void ggml_cuda_op_acc(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/alibi.cu b/ggml-cuda/alibi.cu new file mode 100644 index 000000000..6c7f1fd95 --- /dev/null +++ b/ggml-cuda/alibi.cu @@ -0,0 +1,63 @@ +#include "alibi.cuh" + +static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows, + const int n_heads_log2_floor, const float m0, const float m1) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + + const int k = row/k_rows; + + float m_k; + if (k < n_heads_log2_floor) { + m_k = powf(m0, k + 1); + } else { + m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1); + } + + dst[i] = col * m_k + x[i]; +} + +static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, + const int k_rows, const int n_heads_log2_floor, const float m0, + const float m1, cudaStream_t stream) { + const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1); + const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE); + const dim3 block_nums(num_blocks_x, nrows, 1); + alibi_f32<<>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1); +} + +void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); + + //GGML_ASSERT(ne01 + n_past == ne00); + GGML_ASSERT(n_head == ne02); + + const int n_heads_log2_floor = 1 << (int) floor(log2(n_head)); + + const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream); +} diff --git a/ggml-cuda/alibi.cuh b/ggml-cuda/alibi.cuh new file mode 100644 index 000000000..630adfc7f --- /dev/null +++ b/ggml-cuda/alibi.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ALIBI_BLOCK_SIZE 32 + +void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/arange.cu b/ggml-cuda/arange.cu new file mode 100644 index 000000000..b5e495a24 --- /dev/null +++ b/ggml-cuda/arange.cu @@ -0,0 +1,34 @@ +#include "arange.cuh" + +static __global__ void arange_f32(float * dst, const int ne0, const float start, const float step) { + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + dst[nidx] = start + step * nidx; +} + +static void arange_f32_cuda(float * dst, const int ne0, const float start, const float step, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_ARANGE_BLOCK_SIZE - 1) / CUDA_ARANGE_BLOCK_SIZE; + arange_f32<<>>(dst, ne0, start, step); +} + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float start; + float stop; + float step; + memcpy(&start, (float *)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float *)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float *)dst->op_params + 2, sizeof(float)); + + int64_t steps = (int64_t)ceil((stop - start) / step); + GGML_ASSERT(ggml_nelements(dst) == steps); + + arange_f32_cuda(dst_d, dst->ne[0], start, step, stream); +} diff --git a/ggml-cuda/arange.cuh b/ggml-cuda/arange.cuh new file mode 100644 index 000000000..41e74fdfc --- /dev/null +++ b/ggml-cuda/arange.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ARANGE_BLOCK_SIZE 256 + +void ggml_cuda_op_arange(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/argsort.cu b/ggml-cuda/argsort.cu new file mode 100644 index 000000000..164144061 --- /dev/null +++ b/ggml-cuda/argsort.cu @@ -0,0 +1,103 @@ +#include "argsort.cuh" + +template +static inline __device__ void ggml_cuda_swap(T & a, T & b) { + T tmp = a; + a = b; + b = tmp; +} + +template +static __global__ void k_argsort_f32_i32(const float * x, int * dst, const int ncols, int ncols_pad) { + // bitonic sort + int col = threadIdx.x; + int row = blockIdx.y; + + if (col >= ncols_pad) { + return; + } + + const float * x_row = x + row * ncols; + extern __shared__ int dst_row[]; + + // initialize indices + dst_row[col] = col; + + __syncthreads(); + + for (int k = 2; k <= ncols_pad; k *= 2) { + for (int j = k / 2; j > 0; j /= 2) { + int ixj = col ^ j; + if (ixj > col) { + if ((col & k) == 0) { + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + ggml_cuda_swap(dst_row[col], dst_row[ixj]); + } + } else { + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + ggml_cuda_swap(dst_row[col], dst_row[ixj]); + } + } + } + __syncthreads(); + } + } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } +} + +static int next_power_of_2(int x) { + int n = 1; + while (n < x) { + n *= 2; + } + return n; +} + +static void argsort_f32_i32_cuda(const float * x, int * dst, const int ncols, const int nrows, ggml_sort_order order, cudaStream_t stream) { + // bitonic sort requires ncols to be power of 2 + const int ncols_pad = next_power_of_2(ncols); + + const dim3 block_dims(ncols_pad, 1, 1); + const dim3 block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); + + if (order == GGML_SORT_ORDER_ASC) { + k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + } else if (order == GGML_SORT_ORDER_DESC) { + k_argsort_f32_i32<<>>(x, dst, ncols, ncols_pad); + } else { + GGML_ASSERT(false); + } +} + +void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + + argsort_f32_i32_cuda(src0_d, (int *)dst_d, ncols, nrows, order, stream); +} diff --git a/ggml-cuda/argsort.cuh b/ggml-cuda/argsort.cuh new file mode 100644 index 000000000..68a001547 --- /dev/null +++ b/ggml-cuda/argsort.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_argsort(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/binbcast.cu b/ggml-cuda/binbcast.cu new file mode 100644 index 000000000..19b08b74f --- /dev/null +++ b/ggml-cuda/binbcast.cu @@ -0,0 +1,280 @@ +#include "binbcast.cuh" + +static __device__ __forceinline__ float op_repeat(const float a, const float b) { + return b; + GGML_UNUSED(a); +} + +static __device__ __forceinline__ float op_add(const float a, const float b) { + return a + b; +} + +static __device__ __forceinline__ float op_mul(const float a, const float b) { + return a * b; +} + +static __device__ __forceinline__ float op_div(const float a, const float b) { + return a / b; +} + +template +static __global__ void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13) { + const int i0s = blockDim.x*blockIdx.x + threadIdx.x; + const int i1 = (blockDim.y*blockIdx.y + threadIdx.y); + const int i2 = (blockDim.z*blockIdx.z + threadIdx.z) / ne3; + const int i3 = (blockDim.z*blockIdx.z + threadIdx.z) % ne3; + + if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + for (int i0 = i0s; i0 < ne0; i0 += blockDim.x*gridDim.x) { + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); + } +} + +template +static __global__ void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst, + int ne0, int ne1, int ne2, int ne3, + int ne10, int ne11, int ne12, int ne13, + /*int s0, */ int s1, int s2, int s3, + /*int s00,*/ int s01, int s02, int s03, + /*int s10,*/ int s11, int s12, int s13) { + + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + const int i3 = i/(ne2*ne1*ne0); + const int i2 = (i/(ne1*ne0)) % ne2; + const int i1 = (i/ne0) % ne1; + const int i0 = i % ne0; + + if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) { + return; + } + + const int i11 = i1 % ne11; + const int i12 = i2 % ne12; + const int i13 = i3 % ne13; + + const size_t i_src0 = i3*s03 + i2*s02 + i1*s01; + const size_t i_src1 = i13*s13 + i12*s12 + i11*s11; + const size_t i_dst = i3*s3 + i2*s2 + i1*s1; + + const src0_t * src0_row = src0 + i_src0; + const src1_t * src1_row = src1 + i_src1; + dst_t * dst_row = dst + i_dst; + + const int i10 = i0 % ne10; + dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]); +} + +template +struct bin_bcast_cuda { + template + void operator()(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, + const src0_t * src0_dd, const src1_t * src1_dd, dst_t * dst_dd, + cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + int nr0 = ne10/ne0; + int nr1 = ne11/ne1; + int nr2 = ne12/ne2; + int nr3 = ne13/ne3; + + int nr[4] = { nr0, nr1, nr2, nr3 }; + + // collapse dimensions until first broadcast dimension + int64_t cne[] = {ne0, ne1, ne2, ne3}; + int64_t cne0[] = {ne00, ne01, ne02, ne03}; + int64_t cne1[] = {ne10, ne11, ne12, ne13}; + + size_t cnb[] = {nb0, nb1, nb2, nb3}; + size_t cnb0[] = {nb00, nb01, nb02, nb03}; + size_t cnb1[] = {nb10, nb11, nb12, nb13}; + + auto collapse = [](int64_t cne[]) { + cne[0] *= cne[1]; + cne[1] = cne[2]; + cne[2] = cne[3]; + cne[3] = 1; + }; + + auto collapse_nb = [](size_t cnb[], const int64_t cne[]) { + cnb[1] *= cne[1]; + cnb[2] *= cne[2]; + cnb[3] *= cne[3]; + }; + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ggml_is_contiguous(dst)) { + for (int i = 0; i < 4; i++) { + if (nr[i] != 1) { + break; + } + if (i > 0) { + collapse_nb(cnb, cne); + collapse_nb(cnb0, cne0); + collapse_nb(cnb1, cne1); + collapse(cne); + collapse(cne0); + collapse(cne1); + } + } + } + + { + int64_t ne0 = cne[0]; + int64_t ne1 = cne[1]; + int64_t ne2 = cne[2]; + int64_t ne3 = cne[3]; + + //int64_t ne00 = cne0[0]; GGML_UNUSED(ne00); + //int64_t ne01 = cne0[1]; GGML_UNUSED(ne01); + //int64_t ne02 = cne0[2]; GGML_UNUSED(ne02); + //int64_t ne03 = cne0[3]; GGML_UNUSED(ne03); + + int64_t ne10 = cne1[0]; + int64_t ne11 = cne1[1]; + int64_t ne12 = cne1[2]; + int64_t ne13 = cne1[3]; + + size_t nb0 = cnb[0]; + size_t nb1 = cnb[1]; + size_t nb2 = cnb[2]; + size_t nb3 = cnb[3]; + + size_t nb00 = cnb0[0]; + size_t nb01 = cnb0[1]; + size_t nb02 = cnb0[2]; + size_t nb03 = cnb0[3]; + + size_t nb10 = cnb1[0]; + size_t nb11 = cnb1[1]; + size_t nb12 = cnb1[2]; + size_t nb13 = cnb1[3]; + + size_t s0 = nb0 / sizeof(dst_t); + size_t s1 = nb1 / sizeof(dst_t); + size_t s2 = nb2 / sizeof(dst_t); + size_t s3 = nb3 / sizeof(dst_t); + + size_t s10 = nb10 / sizeof(src1_t); + size_t s11 = nb11 / sizeof(src1_t); + size_t s12 = nb12 / sizeof(src1_t); + size_t s13 = nb13 / sizeof(src1_t); + + size_t s00 = nb00 / sizeof(src0_t); + size_t s01 = nb01 / sizeof(src0_t); + size_t s02 = nb02 / sizeof(src0_t); + size_t s03 = nb03 / sizeof(src0_t); + + GGML_ASSERT(nb0 % sizeof(dst_t) == 0); + GGML_ASSERT(nb1 % sizeof(dst_t) == 0); + GGML_ASSERT(nb2 % sizeof(dst_t) == 0); + GGML_ASSERT(nb3 % sizeof(dst_t) == 0); + + GGML_ASSERT(nb00 % sizeof(src0_t) == 0); + GGML_ASSERT(nb01 % sizeof(src0_t) == 0); + GGML_ASSERT(nb02 % sizeof(src0_t) == 0); + GGML_ASSERT(nb03 % sizeof(src0_t) == 0); + + GGML_ASSERT(nb10 % sizeof(src1_t) == 0); + GGML_ASSERT(nb11 % sizeof(src1_t) == 0); + GGML_ASSERT(nb12 % sizeof(src1_t) == 0); + GGML_ASSERT(nb13 % sizeof(src1_t) == 0); + + GGML_ASSERT(s0 == 1); + GGML_ASSERT(s00 == 1); + GGML_ASSERT(s10 == 1); + + const int block_size = 128; + + int64_t hne0 = std::max(ne0/2LL, 1LL); + + dim3 block_dims; + block_dims.x = std::min(hne0, block_size); + block_dims.y = std::min(ne1, block_size / block_dims.x); + block_dims.z = std::min(std::min(ne2*ne3, block_size / block_dims.x / block_dims.y), 64U); + + dim3 block_nums( + (hne0 + block_dims.x - 1) / block_dims.x, + (ne1 + block_dims.y - 1) / block_dims.y, + (ne2*ne3 + block_dims.z - 1) / block_dims.z + ); + + if (block_nums.z > 65535) { + // this is the maximum number of blocks in z dimension, fallback to 1D grid kernel + int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size; + k_bin_bcast_unravel<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00, */ s01, s02, s03, + /* s10, */ s11, s12, s13); + } else { + k_bin_bcast<<>>( + src0_dd, src1_dd, dst_dd, + ne0, ne1, ne2, ne3, + ne10, ne11, ne12, ne13, + /* s0, */ s1, s2, s3, + /* s00, */ s01, s02, s03, + /* s10, */ s11, s12, s13); + } + } + } +}; + +template +static void ggml_cuda_op_bin_bcast( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const void * src1_dd, void * dst_dd, cudaStream_t stream) { + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const float *)src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (half *) dst_dd, stream); + } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + op()(src0, src1, dst, (const half *) src0_dd, (const float *)src1_dd, (float *)dst_dd, stream); + } else { + fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__, + ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + +void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst, dst->src[0], dst, nullptr, dst->src[0]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} + +void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + ggml_cuda_op_bin_bcast>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream()); +} diff --git a/ggml-cuda/binbcast.cuh b/ggml-cuda/binbcast.cuh new file mode 100644 index 000000000..4f63d6372 --- /dev/null +++ b/ggml-cuda/binbcast.cuh @@ -0,0 +1,6 @@ +#include "common.cuh" + +void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst); +void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/clamp.cu b/ggml-cuda/clamp.cu new file mode 100644 index 000000000..8009a3e3d --- /dev/null +++ b/ggml-cuda/clamp.cu @@ -0,0 +1,34 @@ +#include "clamp.cuh" + +static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]); +} + +static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE; + clamp_f32<<>>(x, dst, min, max, k); +} + + +void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); + + clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream); +} diff --git a/ggml-cuda/clamp.cuh b/ggml-cuda/clamp.cuh new file mode 100644 index 000000000..7f9559dd1 --- /dev/null +++ b/ggml-cuda/clamp.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CLAMP_BLOCK_SIZE 256 + +void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/common.cuh b/ggml-cuda/common.cuh new file mode 100644 index 000000000..44e67e040 --- /dev/null +++ b/ggml-cuda/common.cuh @@ -0,0 +1,654 @@ +#pragma once + +#include "ggml.h" +#include "ggml-cuda.h" + +#include + +#if defined(GGML_USE_HIPBLAS) +#define GGML_COMMON_DECL_HIP +#define GGML_COMMON_IMPL_HIP +#else +#define GGML_COMMON_DECL_CUDA +#define GGML_COMMON_IMPL_CUDA +#endif +#include "ggml-common.h" + +#include +#include +#include +#include +#include +#include + +#if defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaErrorPeerAccessAlreadyEnabled hipErrorPeerAccessAlreadyEnabled +#define cudaErrorPeerAccessNotEnabled hipErrorPeerAccessNotEnabled +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister +#define cudaLaunchHostFunc hipLaunchHostFunc +#ifdef GGML_HIP_UMA +#define cudaMalloc hipMallocManaged +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size) +#else +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#endif +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyPeerAsync hipMemcpyPeerAsync +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamPerThread hipStreamPerThread +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess +#define __trap abort +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED +#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED +#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE +#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH +#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR +#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED +#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED +#else +#include +#include +#include +#include + +#if CUDART_VERSION < 11020 +#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED +#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH +#define CUBLAS_COMPUTE_16F CUDA_R_16F +#define CUBLAS_COMPUTE_32F CUDA_R_32F +#define cublasComputeType_t cudaDataType_t +#endif // CUDART_VERSION < 11020 + +#endif // defined(GGML_USE_HIPBLAS) + +#define STRINGIZE_IMPL(...) #__VA_ARGS__ +#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__) + +#define WARP_SIZE 32 +#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) +#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons + +#define CC_PASCAL 600 +#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products +#define CC_VOLTA 700 +#define CC_AMPERE 800 +#define CC_OFFSET_AMD 1000000 +#define CC_RDNA1 (CC_OFFSET_AMD + 1010) +#define CC_RDNA2 (CC_OFFSET_AMD + 1030) +#define CC_RDNA3 (CC_OFFSET_AMD + 1100) + +// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication +// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant +// for large computational tasks. the drawback is that this requires some extra amount of VRAM: +// - 7B quantum model: +100-200 MB +// - 13B quantum model: +200-400 MB +// +//#define GGML_CUDA_FORCE_MMQ + +// TODO: improve this to be correct for more hardware +// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores +#if !defined(GGML_CUDA_FORCE_MMQ) +#define CUDA_USE_TENSOR_CORES +#endif + +#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels +#define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available + +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define GGML_CUDA_MAX_STREAMS 8 + +[[noreturn]] +void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg); + +#define CUDA_CHECK_GEN(err, success, error_fn) \ + do { \ + auto err_ = (err); \ + if (err_ != (success)) { \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ + } \ + } while (0) + +#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) + +#if CUDART_VERSION >= 12000 + static const char * cublas_get_error_str(const cublasStatus_t err) { + return cublasGetStatusString(err); + } +#else + static const char * cublas_get_error_str(const cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: return "unknown error"; + } + } +#endif // CUDART_VERSION >= 12000 + +#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) + +#if !defined(GGML_USE_HIPBLAS) +static const char * cu_get_error_str(CUresult err) { + const char * err_str; + cuGetErrorString(err, &err_str); + return err_str; +} +#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) +#endif + +#if CUDART_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + +#ifdef GGML_CUDA_F16 +typedef half dfloat; // dequantize float +typedef half2 dfloat2; +#else +typedef float dfloat; // dequantize float +typedef float2 dfloat2; +#endif //GGML_CUDA_F16 + +#if defined(GGML_USE_HIPBLAS) +#define __CUDA_ARCH__ 1300 + +#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \ + defined(__gfx1150__) || defined(__gfx1151__) +#define RDNA3 +#endif + +#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \ + defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__) +#define RDNA2 +#endif + +#ifndef __has_builtin + #define __has_builtin(x) 0 +#endif + +typedef int8_t int8x4_t __attribute__((ext_vector_type(4))); +typedef uint8_t uint8x4_t __attribute__((ext_vector_type(4))); +static __device__ __forceinline__ int __vsubss4(const int a, const int b) { + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); +#if __has_builtin(__builtin_elementwise_sub_sat) + const int8x4_t c = __builtin_elementwise_sub_sat(va, vb); + return reinterpret_cast(c); +#else + int8x4_t c; + int16_t tmp; +#pragma unroll + for (int i = 0; i < 4; i++) { + tmp = va[i] - vb[i]; + if(tmp > std::numeric_limits::max()) tmp = std::numeric_limits::max(); + if(tmp < std::numeric_limits::min()) tmp = std::numeric_limits::min(); + c[i] = tmp; + } + return reinterpret_cast(c); +#endif // __has_builtin(__builtin_elementwise_sub_sat) +} + +static __device__ __forceinline__ int __vsub4(const int a, const int b) { + return __vsubss4(a, b); +} + +static __device__ __forceinline__ unsigned int __vcmpeq4(unsigned int a, unsigned int b) { + const uint8x4_t& va = reinterpret_cast(a); + const uint8x4_t& vb = reinterpret_cast(b); + unsigned int c; + uint8x4_t& vc = reinterpret_cast(c); +#pragma unroll + for (int i = 0; i < 4; ++i) { + vc[i] = va[i] == vb[i] ? 0xff : 0x00; + } + return c; +} + +static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { +#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__) + c = __builtin_amdgcn_sdot4(a, b, c, false); +#elif defined(RDNA3) + c = __builtin_amdgcn_sudot4( true, a, true, b, c, false); +#elif defined(__gfx1010__) || defined(__gfx900__) + int tmp1; + int tmp2; + asm("\n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \ + v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \ + v_add3_u32 %0, %1, %2, %0 \n \ + " + : "+v"(c), "=&v"(tmp1), "=&v"(tmp2) + : "v"(a), "v"(b) + ); +#else + const int8x4_t va = reinterpret_cast(a); + const int8x4_t vb = reinterpret_cast(b); + c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3]; +#endif + return c; +} +#endif // defined(GGML_USE_HIPBLAS) + +#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL + +#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA + +static bool fp16_mma_available(const int cc) { + return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; +} + +[[noreturn]] +static __device__ void no_device_code( + const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", + file_name, line, function_name, arch); + GGML_UNUSED(arch_list); +#else + printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n", + file_name, line, function_name, arch, arch_list); +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + __trap(); + + GGML_UNUSED(no_device_code); // suppress unused function warning +} + +#ifdef __CUDA_ARCH__ +#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) +#else +#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#endif // __CUDA_ARCH__ + +static __device__ __forceinline__ float warp_reduce_sum(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x += __shfl_xor_sync(0xffffffff, x, mask, 32); + } + return x; +} + +static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32); + a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32); + } + return a; +} + +static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { +#if FP16_AVAILABLE + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32); + reinterpret_cast(a.x) += __low2half(a_other); + reinterpret_cast(a.y) += __high2half(a_other); + } + return a; +#else +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); + } + return a; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + +#else + NO_DEVICE_CODE; + return a; +#endif // FP16_AVAILABLE +} + +static __device__ __forceinline__ float warp_reduce_max(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + } + return x; +} + +static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) { +#if FP16_AVAILABLE + +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX + return __float2half(fmaxf(__half2float(a), __half2float(b))); +#else + return __hmax(a, b); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX + +#else + NO_DEVICE_CODE; + GGML_UNUSED(b); + return a; +#endif // FP16_AVAILABLE +} + +static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) + +#if CUDART_VERSION >= CUDART_HMAX + return __hmax2(a, b); +#else + half2 ret; + reinterpret_cast(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b))); + reinterpret_cast(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b))); + return ret; +#endif // CUDART_VERSION >= CUDART_HMAX + +#else + GGML_UNUSED(a); + GGML_UNUSED(b); + NO_DEVICE_CODE; +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +} + +static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); + } + return x; +#else + GGML_UNUSED(x); + NO_DEVICE_CODE; +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +} + +#if CUDART_VERSION < CUDART_HMASK +static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) { + const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b))); + const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b))); + return mask_low | mask_high; +} +#endif // CUDART_VERSION < 12000 + +// TODO: move to ggml-common.h +static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + +typedef void (*dequantize_kernel_t)(const void * vx, const int64_t ib, const int iqs, dfloat2 & v); + + +////////////////////// + +struct ggml_cuda_device_info { + int device_count; + + struct cuda_device_info { + int cc; // compute capability + int nsm; // number of streaming multiprocessors + size_t smpb; // max. shared memory per block + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; + }; + + cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {}; + + std::array default_tensor_split = {}; +}; + +const ggml_cuda_device_info & ggml_cuda_info(); + +void ggml_cuda_set_device(int device); +int ggml_cuda_get_device(); + +struct ggml_cuda_pool { + virtual ~ggml_cuda_pool() = default; + + virtual void * alloc(size_t size, size_t * actual_size) = 0; + virtual void free(void * ptr, size_t size) = 0; +}; + +template +struct ggml_cuda_pool_alloc { + ggml_cuda_pool * pool = nullptr; + T * ptr = nullptr; + size_t actual_size = 0; + + ggml_cuda_pool_alloc() = default; + + explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) { + } + + ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) { + alloc(size); + } + + ~ggml_cuda_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + // size is in number of elements + T * alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + T * alloc(ggml_cuda_pool & pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + T * get() { + return ptr; + } + + ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete; + ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete; + ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete; + ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete; +}; + + +// backend interface + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + + +#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS) +#define USE_CUDA_GRAPH +#endif + +struct ggml_graph_node_properties { + void * node_address; + ggml_op node_op; + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + void * src_address[GGML_MAX_SRC]; +}; + +struct ggml_cuda_graph { +#ifdef USE_CUDA_GRAPH + ~ggml_cuda_graph() { + if (instance != nullptr) { + CUDA_CHECK(cudaGraphExecDestroy(instance)); + } + if (graph != nullptr) { + CUDA_CHECK(cudaGraphDestroy(graph)); + } + } + cudaGraph_t graph = nullptr; + cudaGraphExec_t instance = nullptr; + size_t num_nodes = 0; + std::vector nodes; + std::vector params; + bool disable_due_to_gpu_arch = false; + bool disable_due_to_too_many_updates = false; + bool disable_due_to_failed_graph_capture = false; + int number_consecutive_updates = 0; + std::vector ggml_graph_properties; + std::vector updated_kernel_arg; +#endif +}; + +struct ggml_backend_cuda_context { + int device; + std::string name; + cudaEvent_t copy_event = nullptr; + + cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; + cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + std::unique_ptr cuda_graph; + + explicit ggml_backend_cuda_context(int device) : + device(device), + name(GGML_CUDA_NAME + std::to_string(device)) { + } + + ~ggml_backend_cuda_context() { + if (copy_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(copy_event)); + } + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { + for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { + if (streams[i][j] != nullptr) { + CUDA_CHECK(cudaStreamDestroy(streams[i][j])); + } + } + if (cublas_handles[i] != nullptr) { + CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); + } + } + } + + cudaStream_t stream(int device, int stream) { + if (streams[device][stream] == nullptr) { + ggml_cuda_set_device(device); + CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking)); + } + return streams[device][stream]; + } + + cudaStream_t stream() { + return stream(device, 0); + } + + cublasHandle_t cublas_handle(int device) { + if (cublas_handles[device] == nullptr) { + ggml_cuda_set_device(device); + CUBLAS_CHECK(cublasCreate(&cublas_handles[device])); + CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH)); + } + return cublas_handles[device]; + } + + cublasHandle_t cublas_handle() { + return cublas_handle(device); + } + + // pool + std::unique_ptr pools[GGML_CUDA_MAX_DEVICES]; + + static std::unique_ptr new_pool_for_device(int device); + + ggml_cuda_pool & pool(int device) { + if (pools[device] == nullptr) { + pools[device] = new_pool_for_device(device); + } + return *pools[device]; + } + + ggml_cuda_pool & pool() { + return pool(device); + } +}; diff --git a/ggml-cuda/concat.cu b/ggml-cuda/concat.cu new file mode 100644 index 000000000..2941d2f17 --- /dev/null +++ b/ggml-cuda/concat.cu @@ -0,0 +1,49 @@ +#include "concat.cuh" + +static __global__ void concat_f32(const float * x,const float * y, float * dst, const int ne0, const int ne02) { + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (blockIdx.z < ne02) { // src0 + int offset_src = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + dst[offset_dst] = x[offset_src]; + } else { + int offset_src = + nidx + + blockIdx.y * ne0 + + (blockIdx.z - ne02) * ne0 * gridDim.y; + dst[offset_dst] = y[offset_src]; + } +} + +static void concat_f32_cuda(const float * x, const float * y, float * dst, const int ne0, int ne1, int ne2, int ne02, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne1, ne2); + concat_f32<<>>(x, y, dst, ne0, ne02); +} + +void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + for (int i3 = 0; i3 < dst->ne[3]; i3++) { + concat_f32_cuda(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4), dst_d + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], stream); + } +} diff --git a/ggml-cuda/concat.cuh b/ggml-cuda/concat.cuh new file mode 100644 index 000000000..aa506a05f --- /dev/null +++ b/ggml-cuda/concat.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_CONCAT_BLOCK_SIZE 256 + +void ggml_cuda_op_concat(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/convert.cu b/ggml-cuda/convert.cu new file mode 100644 index 000000000..830e2d756 --- /dev/null +++ b/ggml-cuda/convert.cu @@ -0,0 +1,824 @@ +#include "convert.cuh" +#include "dequantize.cuh" + +#define CUDA_Q8_0_NE_ALIGN 2048 + +template +static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { + const int64_t i = (int64_t)2*(blockDim.x*blockIdx.x + threadIdx.x); + + if (i >= k) { + return; + } + + const int64_t ib = i/qk; // block index + const int64_t iqs = (i%qk)/qr; // quant index + const int64_t iybs = i - i%qk; // y block start index + const int64_t y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(vx, ib, iqs, v); + + y[iybs + iqs + 0] = v.x; + y[iybs + iqs + y_offset] = v.y; +} + +template +static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, half * __restrict__ y, const int64_t k) { +#if __CUDA_ARCH__ >= CC_PASCAL + constexpr int nint = CUDA_Q8_0_NE_ALIGN/sizeof(int) + WARP_SIZE; + + const int64_t i0 = CUDA_Q8_0_NE_ALIGN*blockIdx.x; + const int * x0 = ((int *) vx) + blockIdx.x * nint; + half2 * y2 = (half2 *) (y + i0); + + __shared__ int vals[nint]; + +#pragma unroll + for (int ix0 = 0; ix0 < nint; ix0 += WARP_SIZE) { + if (need_check && i0*sizeof(block_q8_0)/QK8_0 + sizeof(int)*(ix0 + threadIdx.x) >= k*sizeof(block_q8_0)/QK8_0) { + break; + } + + const int ix = ix0 + threadIdx.x; + vals[ix] = x0[ix]; + } + + __syncthreads(); + +#pragma unroll + for (int iy = 0; iy < CUDA_Q8_0_NE_ALIGN; iy += 2*WARP_SIZE) { + if (need_check && i0 + iy + 2*threadIdx.x >= k) { + return; + } + + const half * b0 = ((const half *) vals) + (sizeof(block_q8_0)/sizeof(half)) * ((iy + 2*threadIdx.x)/QK8_0); + const half d = *b0; + const char2 qs = ((const char2 *) (b0 + 1))[threadIdx.x % (QK8_0/2)]; + + y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d)); + } +#else + GGML_UNUSED(vx); + GGML_UNUSED(y); + GGML_UNUSED(k); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_PASCAL +} + +template +static __global__ void dequantize_block_q4_0(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int64_t i = blockIdx.x; + + // assume 32 threads + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; + const int64_t ir = tid%8; + const int64_t ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_0 * x = (const block_q4_0 *)vx + ib; + const float d = __half2float(x->d); + const float dm = -8*d; + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d * (q[l] & 0xF) + dm; + y[l+16] = d * (q[l] >> 4) + dm; + } +} + +template +static __global__ void dequantize_block_q4_1(const void * __restrict__ vx, dst_t * __restrict__ yy, int nb32) { + + const int64_t i = blockIdx.x; + + // assume 32 threads + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; + const int64_t ir = tid%8; + const int64_t ib = 8*i + ir; + if (ib >= nb32) { + return; + } + + dst_t * y = yy + 256*i + 32*ir + 4*il; + + const block_q4_1 * x = (const block_q4_1 *)vx + ib; + const float2 d = __half22float2(x->dm); + + const uint8_t * q = x->qs + 4*il; + + for (int l = 0; l < 4; ++l) { + y[l+ 0] = d.x * (q[l] & 0xF) + d.y; + y[l+16] = d.x * (q[l] >> 4) + d.y; + } +} + +//================================== k-quants + +template +static __global__ void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_q2_K * x = (const block_q2_K *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t n = tid/32; + const int64_t l = tid - 32*n; + const int64_t is = 8*n + l/16; + + const uint8_t q = x[i].qs[32*n + l]; + dst_t * y = yy + i*QK_K + 128*n; + + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4); + y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4); + y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4); +#else + const int64_t is = tid/16; // 0 or 1 + const int64_t il = tid%16; // 0...15 + const uint8_t q = x[i].qs[il] >> (2*is); + dst_t * y = yy + i*QK_K + 16*is + il; + float dall = __low2half(x[i].dm); + float dmin = __high2half(x[i].dm); + y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4); + y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4); +#endif + +} + +template +static __global__ void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_q3_K * x = (const block_q3_K *) vx; + +#if QK_K == 256 + const int64_t r = threadIdx.x/4; + const int64_t tid = r/2; + const int64_t is0 = r%2; + const int64_t l0 = 16*is0 + 4*(threadIdx.x%4); + const int64_t n = tid / 4; + const int64_t j = tid - 4*n; + + uint8_t m = 1 << (4*n + j); + int64_t is = 8*n + 2*j + is0; + int shift = 2*j; + + int8_t us = is < 4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (x[i].scales[is-8] >> 4) | (((x[i].scales[is+0] >> 4) & 3) << 4) : + (x[i].scales[is-8] >> 4) | (((x[i].scales[is-4] >> 6) & 3) << 4); + float d_all = x[i].d; + float dl = d_all * (us - 32); + + dst_t * y = yy + i*QK_K + 128*n + 32*j; + const uint8_t * q = x[i].qs + 32*n; + const uint8_t * hm = x[i].hmask; + + for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4)); +#else + const int64_t tid = threadIdx.x; + const int64_t is = tid/16; // 0 or 1 + const int64_t il = tid%16; // 0...15 + const int64_t im = il/8; // 0...1 + const int64_t in = il%8; // 0...7 + + dst_t * y = yy + i*QK_K + 16*is + il; + + const uint8_t q = x[i].qs[il] >> (2*is); + const uint8_t h = x[i].hmask[in] >> (2*is + im); + const float d = (float)x[i].d; + + if (is == 0) { + y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } else { + y[ 0] = d * ((x[i].scales[0] >> 4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4)); + y[32] = d * ((x[i].scales[1] >> 4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4)); + } +#endif + +} + +#if QK_K == 256 +static inline __device__ void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) { + if (j < 4) { + d = q[j] & 63; m = q[j + 4] & 63; + } else { + d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4); + m = (q[j+4] >> 4) | ((q[j-0] >> 6) << 4); + } +} +#endif + +template +static __global__ void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q4_K * x = (const block_q4_K *) vx; + + const int64_t i = blockIdx.x; + +#if QK_K == 256 + // assume 32 threads + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; + const int64_t ir = tid%8; + const int64_t is = 2*il; + const int64_t n = 4; + + dst_t * y = yy + i*QK_K + 64*il + n*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * q = x[i].qs + 32*il + n*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + for (int l = 0; l < n; ++l) { + y[l + 0] = d1 * (q[l] & 0xF) - m1; + y[l +32] = d2 * (q[l] >> 4) - m2; + } +#else + const int64_t tid = threadIdx.x; + const uint8_t * q = x[i].qs; + dst_t * y = yy + i*QK_K; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4); + y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >> 4) - m * (x[i].scales[1] >> 4); +#endif +} + +template +static __global__ void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q5_K * x = (const block_q5_K *) vx; + + const int64_t i = blockIdx.x; + +#if QK_K == 256 + // assume 64 threads - this is very slightly better than the one below + const int64_t tid = threadIdx.x; + const int64_t il = tid/16; // il is in 0...3 + const int64_t ir = tid%16; // ir is in 0...15 + const int64_t is = 2*il; // is is in 0...6 + + dst_t * y = yy + i*QK_K + 64*il + 2*ir; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint8_t * ql = x[i].qs + 32*il + 2*ir; + const uint8_t * qh = x[i].qh + 2*ir; + + uint8_t sc, m; + get_scale_min_k4(is + 0, x[i].scales, sc, m); + const float d1 = dall * sc; const float m1 = dmin * m; + get_scale_min_k4(is + 1, x[i].scales, sc, m); + const float d2 = dall * sc; const float m2 = dmin * m; + + uint8_t hm = 1 << (2*il); + y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1; + y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1; + hm <<= 1; + y[32] = d2 * ((ql[ 0] >> 4) + (qh[ 0] & hm ? 16 : 0)) - m2; + y[33] = d2 * ((ql[ 1] >> 4) + (qh[ 1] & hm ? 16 : 0)) - m2; +#else + const int64_t tid = threadIdx.x; + const uint8_t q = x[i].qs[tid]; + const int64_t im = tid/8; // 0...3 + const int64_t in = tid%8; // 0...7 + const int64_t is = tid/16; // 0 or 1 + const uint8_t h = x[i].qh[in] >> im; + const float d = x[i].d; + dst_t * y = yy + i*QK_K + tid; + y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16)); + y[32] = d * x[i].scales[is+2] * ((q >> 4) - ((h >> 4) & 1 ? 0 : 16)); +#endif +} + +template +static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const block_q6_K * x = (const block_q6_K *) vx; + + const int64_t i = blockIdx.x; +#if QK_K == 256 + + // assume 64 threads - this is very slightly better than the one below + const int64_t tid = threadIdx.x; + const int64_t ip = tid/32; // ip is 0 or 1 + const int64_t il = tid - 32*ip; // 0...32 + const int64_t is = 8*ip + il/16; + + dst_t * y = yy + i*QK_K + 128*ip + il; + + const float d = x[i].d; + + const uint8_t * ql = x[i].ql + 64*ip + il; + const uint8_t qh = x[i].qh[32*ip + il]; + const int8_t * sc = x[i].scales + is; + + y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32); + y[64] = d * sc[4] * ((int8_t)((ql[ 0] >> 4) | (((qh >> 4) & 3) << 4)) - 32); + y[96] = d * sc[6] * ((int8_t)((ql[32] >> 4) | (((qh >> 6) & 3) << 4)) - 32); +#else + + // assume 32 threads + const int64_t tid = threadIdx.x; + const int64_t ip = tid/16; // 0 or 1 + const int64_t il = tid - 16*ip; // 0...15 + + dst_t * y = yy + i*QK_K + 16*ip + il; + + const float d = x[i].d; + + const uint8_t ql = x[i].ql[16*ip + il]; + const uint8_t qh = x[i].qh[il] >> (2*ip); + const int8_t * sc = x[i].scales; + + y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32); + y[32] = d * sc[ip+2] * ((int8_t)((ql >> 4) | (((qh >> 4) & 3) << 4)) - 32); +#endif +} + +template +static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq2_xxs * x = (const block_iq2_xxs *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq2_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq3_xxs * x = (const block_iq3_xxs *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * q3 = x[i].qs + 8*ib; + const uint16_t * gas = (const uint16_t *)(x[i].qs + QK_K/4) + 2*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3xxs_grid + q3[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq3xxs_grid + q3[2*il+1]); + const uint32_t aux32 = gas[0] | (gas[1] << 16); + const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.5f; + const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq3_s * x = (const block_iq3_s *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); + const uint8_t signs = x[i].signs[4*ib + il]; + for (int j = 0; j < 4; ++j) { + y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); + y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + } +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq1_s * x = (const block_iq1_s *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + NO_DEVICE_CODE; +#endif + +} + +template +static __global__ void dequantize_block_iq1_m(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int64_t tid = threadIdx.x; +#if QK_K == 256 + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int64_t ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + NO_DEVICE_CODE; +#endif + +} + + +template +static __global__ void dequantize_block_iq4_nl(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int64_t i = blockIdx.x; + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + +#if QK_K != 64 +template +static __global__ void dequantize_block_iq4_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + const int64_t i = blockIdx.x; + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int64_t tid = threadIdx.x; + const int64_t il = tid/8; // 0...3 + const int64_t ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} +#endif + +template +static void dequantize_block_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { + const int num_blocks = (k + 2*CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / (2*CUDA_DEQUANTIZE_BLOCK_SIZE); + dequantize_block<<>>(vx, y, k); +} + +static void dequantize_block_q8_0_f16_cuda(const void * __restrict__ vx, half * __restrict__ y, const int64_t k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_Q8_0_NE_ALIGN - 1) / CUDA_Q8_0_NE_ALIGN; + if (k % CUDA_Q8_0_NE_ALIGN == 0) { + const bool need_check = false; + dequantize_block_q8_0_f16<<>>(vx, y, k); + } else { + const bool need_check = true; + dequantize_block_q8_0_f16<<>>(vx, y, k); + } +} + +template +static void dequantize_row_q2_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q2_K<<>>(vx, y); +#else + dequantize_block_q2_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q3_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q3_K<<>>(vx, y); +#else + dequantize_block_q3_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q4_0_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_0<<>>(vx, y, nb32); +} + +template +static void dequantize_row_q4_1_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb32 = k / 32; + const int nb = (k + 255) / 256; + dequantize_block_q4_1<<>>(vx, y, nb32); +} + +template +static void dequantize_row_q4_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_q4_K<<>>(vx, y); +} + +template +static void dequantize_row_q5_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q5_K<<>>(vx, y); +#else + dequantize_block_q5_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_q6_K_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; +#if QK_K == 256 + dequantize_block_q6_K<<>>(vx, y); +#else + dequantize_block_q6_K<<>>(vx, y); +#endif +} + +template +static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xxs<<>>(vx, y); +} + +template +static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xs<<>>(vx, y); +} + +template +static void dequantize_row_iq2_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_s<<>>(vx, y); +} + +template +static void dequantize_row_iq3_xxs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq3_xxs<<>>(vx, y); +} + +template +static void dequantize_row_iq3_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq3_s<<>>(vx, y); +} + +template +static void dequantize_row_iq1_s_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq1_s<<>>(vx, y); +} + +template +static void dequantize_row_iq4_nl_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; + dequantize_block_iq4_nl<<>>(vx, y); +} + +template +static void dequantize_row_iq1_m_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq1_m<<>>(vx, y); +} + +template +static void dequantize_row_iq4_xs_cuda(const void * vx, dst_t * y, const int64_t k, cudaStream_t stream) { + const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_block_iq4_nl<<>>(vx, y); +#else + dequantize_block_iq4_xs<<>>(vx, y); +#endif +} + +template +static __global__ void convert_unary(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k) { + const int64_t i = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + const src_t * x = (src_t *) vx; + + y[i] = x[i]; +} + +template +static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int64_t k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; + convert_unary<<>>(vx, y, k); +} + +to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) { + return dequantize_block_q8_0_f16_cuda; + } + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_cuda; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_cuda; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; + case GGML_TYPE_F32: + return convert_unary_cuda; + default: + return nullptr; + } +} + +to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + return dequantize_row_q4_0_cuda; + case GGML_TYPE_Q4_1: + return dequantize_row_q4_1_cuda; + case GGML_TYPE_Q5_0: + return dequantize_block_cuda; + case GGML_TYPE_Q5_1: + return dequantize_block_cuda; + case GGML_TYPE_Q8_0: + return dequantize_block_cuda; + case GGML_TYPE_Q2_K: + return dequantize_row_q2_K_cuda; + case GGML_TYPE_Q3_K: + return dequantize_row_q3_K_cuda; + case GGML_TYPE_Q4_K: + return dequantize_row_q4_K_cuda; + case GGML_TYPE_Q5_K: + return dequantize_row_q5_K_cuda; + case GGML_TYPE_Q6_K: + return dequantize_row_q6_K_cuda; + case GGML_TYPE_IQ2_XXS: + return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_cuda; + case GGML_TYPE_IQ3_XXS: + return dequantize_row_iq3_xxs_cuda; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_cuda; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_cuda; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_cuda; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_cuda; + case GGML_TYPE_IQ3_S: + return dequantize_row_iq3_s_cuda; + case GGML_TYPE_F16: + return convert_unary_cuda; + default: + return nullptr; + } +} diff --git a/ggml-cuda/convert.cuh b/ggml-cuda/convert.cuh new file mode 100644 index 000000000..5394be9f1 --- /dev/null +++ b/ggml-cuda/convert.cuh @@ -0,0 +1,13 @@ +#include "common.cuh" + +#define CUDA_DEQUANTIZE_BLOCK_SIZE 256 + +template +using to_t_cuda_t = void (*)(const void * __restrict__ x, T * __restrict__ y, int64_t k, cudaStream_t stream); + +typedef to_t_cuda_t to_fp32_cuda_t; +typedef to_t_cuda_t to_fp16_cuda_t; + +to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type); + +to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type); diff --git a/ggml-cuda/cpy.cu b/ggml-cuda/cpy.cu new file mode 100644 index 000000000..12d741f01 --- /dev/null +++ b/ggml-cuda/cpy.cu @@ -0,0 +1,490 @@ +#include "cpy.cuh" + +typedef void (*cpy_kernel_t)(const char * cx, char * cdst); + +static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +static __device__ void cpy_1_f32_f16(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + half * dsti = (half *) cdsti; + + *dsti = __float2half(*xi); +} + +static __device__ void cpy_1_f16_f16(const char * cxi, char * cdsti) { + const half * xi = (const half *) cxi; + half * dsti = (half *) cdsti; + + *dsti = *xi; +} + +static __device__ void cpy_1_f16_f32(const char * cxi, char * cdsti) { + const half * xi = (const half *) cxi; + float * dsti = (float *) cdsti; + + *dsti = *xi; +} + +template +static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { + const int64_t i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= ne) { + return; + } + + // determine indices i03/i13, i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor + // then combine those indices with the corresponding byte offsets to get the total offsets + const int64_t i03 = i/(ne00 * ne01 * ne02); + const int64_t i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int64_t i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int64_t i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int64_t x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + + const int64_t i13 = i/(ne10 * ne11 * ne12); + const int64_t i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int64_t i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int64_t i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int64_t dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13 * nb13; + + cpy_1(cx + x_offset, cdst + dst_offset); +} + +static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q8_0 * dsti = (block_q8_0 *) cdsti; + + float amax = 0.0f; // absolute max + + for (int j = 0; j < QK8_0; j++) { + const float v = xi[j]; + amax = fmaxf(amax, fabsf(v)); + } + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK8_0; ++j) { + const float x0 = xi[j]*id; + + dsti->qs[j] = roundf(x0); + } +} + +static __device__ void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_0 * dsti = (block_q4_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_0; ++j) { + const float v = xi[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -8; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + for (int j = 0; j < QK4_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_0/2 + j]*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 8.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 8.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static __device__ void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q4_1 * dsti = (block_q4_1 *) cdsti; + + float vmin = FLT_MAX; + float vmax = -FLT_MAX; + + for (int j = 0; j < QK4_1; ++j) { + const float v = xi[j]; + + if (v < vmin) vmin = v; + if (v > vmax) vmax = v; + } + + const float d = (vmax - vmin) / ((1 << 4) - 1); + const float id = d ? 1.0f/d : 0.0f; + + dsti->dm.x = d; + dsti->dm.y = vmin; + + for (int j = 0; j < QK4_1/2; ++j) { + const float x0 = (xi[0 + j] - vmin)*id; + const float x1 = (xi[QK4_1/2 + j] - vmin)*id; + + const uint8_t xi0 = min(15, (int8_t)(x0 + 0.5f)); + const uint8_t xi1 = min(15, (int8_t)(x1 + 0.5f)); + + dsti->qs[j] = xi0; + dsti->qs[j] |= xi1 << 4; + } +} + +static __device__ void cpy_blck_f32_q5_0(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_0 * dsti = (block_q5_0 *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK5_0; ++j) { + const float v = xi[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + const float d = vmax / -16; + const float id = d ? 1.0f/d : 0.0f; + + dsti->d = d; + + uint32_t qh = 0; + for (int j = 0; j < QK5_0/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK5_0/2 + j]*id; + + const uint8_t xi0 = min(31, (int8_t)(x0 + 16.5f)); + const uint8_t xi1 = min(31, (int8_t)(x1 + 16.5f)); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_0/2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + +static __device__ void cpy_blck_f32_q5_1(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_q5_1 * dsti = (block_q5_1 *) cdsti; + + float min = xi[0]; + float max = xi[0]; + + for (int j = 1; j < QK5_1; ++j) { + const float v = xi[j]; + min = v < min ? v : min; + max = v > max ? v : max; + } + + const float d = (max - min) / 31; + const float id = d ? 1.0f/d : 0.0f; + + dsti->dm.x = d; + dsti->dm.y = min; + + uint32_t qh = 0; + for (int j = 0; j < QK5_1/2; ++j) { + const float x0 = (xi[0 + j] - min)*id; + const float x1 = (xi[QK5_1/2 + j] - min)*id; + + const uint8_t xi0 = (uint8_t)(x0 + 0.5f); + const uint8_t xi1 = (uint8_t)(x1 + 0.5f); + + dsti->qs[j] = (xi0 & 0xf) | ((xi1 & 0xf) << 4); + qh |= ((xi0 & 0x10u) >> 4) << (j + 0); + qh |= ((xi1 & 0x10u) >> 4) << (j + QK5_1/2); + } + memcpy(dsti->qh, &qh, sizeof(qh)); +} + + +static __device__ __forceinline__ int best_index_int8(int n, const int8_t * val, float x) { + if (x <= val[0]) return 0; + if (x >= val[n-1]) return n-1; + int ml = 0, mu = n-1; + while (mu-ml > 1) { + int mav = (ml+mu)/2; + if (x < val[mav]) mu = mav; else ml = mav; + } + return x - val[mu-1] < val[mu] - x ? mu-1 : mu; +} + +static __device__ void cpy_blck_f32_iq4_nl(const char * cxi, char * cdsti) { + const float * xi = (const float *) cxi; + block_iq4_nl * dsti = (block_iq4_nl *) cdsti; + + float amax = 0.0f; + float vmax = 0.0f; + + for (int j = 0; j < QK4_NL; ++j) { + const float v = xi[j]; + if (amax < fabsf(v)) { + amax = fabsf(v); + vmax = v; + } + } + + float d = vmax / kvalues_iq4nl[0]; + const float id = d ? 1.0f/d : 0.0f; + + float sumqx = 0, sumq2 = 0; + for (int j = 0; j < QK4_NL/2; ++j) { + const float x0 = xi[0 + j]*id; + const float x1 = xi[QK4_NL/2 + j]*id; + const uint8_t xi0 = best_index_int8(16, kvalues_iq4nl, x0); + const uint8_t xi1 = best_index_int8(16, kvalues_iq4nl, x1); + dsti->qs[j] = xi0 | (xi1 << 4); + const float v0 = kvalues_iq4nl[xi0]; + const float v1 = kvalues_iq4nl[xi1]; + const float w0 = xi[0 + j]*xi[0 + j]; + const float w1 = xi[QK4_NL/2 + j]*xi[QK4_NL/2 + j]; + sumqx += w0*v0*xi[j] + w1*v1*xi[QK4_NL/2 + j]; + sumq2 += w0*v0*v0 + w1*v1*v1; + } + + dsti->d = sumq2 > 0 ? sumqx/sumq2 : d; +} + +template +static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, + const int nb12, const int nb13) { + const int i = (blockDim.x*blockIdx.x + threadIdx.x)*qk; + + if (i >= ne) { + return; + } + + const int i03 = i/(ne00 * ne01 * ne02); + const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01); + const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00; + const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00; + const int x_offset = i00*nb00 + i01*nb01 + i02*nb02 + i03 * nb03; + + const int i13 = i/(ne10 * ne11 * ne12); + const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11); + const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10; + const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10; + const int dst_offset = (i10/qk)*nb10 + i11*nb11 + i12*nb12 + i13*nb13; + + cpy_blck(cx + x_offset, cdst + dst_offset); +} + +static void ggml_cpy_f16_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_f32_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_q8_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK8_0 == 0); + const int num_blocks = ne / QK8_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_q4_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK4_0 == 0); + const int num_blocks = ne / QK4_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_q4_1_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK4_1 == 0); + const int num_blocks = ne / QK4_1; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_q5_0_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK5_0 == 0); + const int num_blocks = ne / QK5_0; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_q5_1_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK5_1 == 0); + const int num_blocks = ne / QK5_1; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f32_iq4_nl_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + GGML_ASSERT(ne % QK4_NL == 0); + const int num_blocks = ne / QK4_NL; + cpy_f32_q<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +static void ggml_cpy_f16_f16_cuda( + const char * cx, char * cdst, const int ne, + const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02, + const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) { + + const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE; + cpy_f32_f16<<>> + (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13); +} + +void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1) { + const int64_t ne = ggml_nelements(src0); + GGML_ASSERT(ne == ggml_nelements(src1)); + + GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); + GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + //GGML_ASSERT(src0->ne[3] == 1); + + const int64_t nb00 = src0->nb[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + const int64_t nb03 = src0->nb[3]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + + //GGML_ASSERT(src1->ne[3] == 1); + + const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; + + cudaStream_t main_stream = ctx.stream(); + + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; + + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { + ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { + ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { + ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + +void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + ggml_cuda_cpy(ctx, src0, dst); +} + +void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) { + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { + return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { + return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) { + return (void*) cpy_f32_q; + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { + return (void*) cpy_f32_f16; + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + return (void*) cpy_f32_f16; + } else { + fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__, + ggml_type_name(src0->type), ggml_type_name(src1->type)); + GGML_ASSERT(false); + } +} + diff --git a/ggml-cuda/cpy.cuh b/ggml-cuda/cpy.cuh new file mode 100644 index 000000000..796167426 --- /dev/null +++ b/ggml-cuda/cpy.cuh @@ -0,0 +1,9 @@ +#include "common.cuh" + +#define CUDA_CPY_BLOCK_SIZE 32 + +void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1); + +void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1); diff --git a/ggml-cuda/dequantize.cuh b/ggml-cuda/dequantize.cuh new file mode 100644 index 000000000..bd3c2d9db --- /dev/null +++ b/ggml-cuda/dequantize.cuh @@ -0,0 +1,103 @@ +#include "common.cuh" + +static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q4_0 * x = (const block_q4_0 *) vx; + + const dfloat d = x[ib].d; + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {8.0f, 8.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 8.0f) * d; + v.y = (v.y - 8.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q4_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q4_1 * x = (const block_q4_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + const int vui = x[ib].qs[iqs]; + + v.x = vui & 0xF; + v.y = vui >> 4; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q5_0 * x = (const block_q5_0 *) vx; + + const dfloat d = x[ib].d; + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hsub2(v, {16.0f, 16.0f}); + v = __hmul2(v, {d, d}); +#else + v.x = (v.x - 16.0f) * d; + v.y = (v.y - 16.0f) * d; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q5_1(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q5_1 * x = (const block_q5_1 *) vx; + + const dfloat d = __low2half(x[ib].dm); + const dfloat m = __high2half(x[ib].dm); + + uint32_t qh; + memcpy(&qh, x[ib].qh, sizeof(qh)); + + const int xh_0 = ((qh >> (iqs + 0)) << 4) & 0x10; + const int xh_1 = ((qh >> (iqs + 12)) ) & 0x10; + + v.x = ((x[ib].qs[iqs] & 0xf) | xh_0); + v.y = ((x[ib].qs[iqs] >> 4) | xh_1); + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); + v = __hadd2(v, {m, m}); +#else + v.x = (v.x * d) + m; + v.y = (v.y * d) + m; +#endif // GGML_CUDA_F16 +} + +static __device__ __forceinline__ void dequantize_q8_0(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const block_q8_0 * x = (const block_q8_0 *) vx; + + const dfloat d = x[ib].d; + + v.x = x[ib].qs[iqs + 0]; + v.y = x[ib].qs[iqs + 1]; + +#ifdef GGML_CUDA_F16 + v = __hmul2(v, {d, d}); +#else + v.x *= d; + v.y *= d; +#endif // GGML_CUDA_F16 +} diff --git a/ggml-cuda/diagmask.cu b/ggml-cuda/diagmask.cu new file mode 100644 index 000000000..4b713ba22 --- /dev/null +++ b/ggml-cuda/diagmask.cu @@ -0,0 +1,40 @@ +#include "diagmask.cuh" + +static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past) { + const int col = blockDim.y*blockIdx.y + threadIdx.y; + const int row = blockDim.x*blockIdx.x + threadIdx.x; + + if (col >= ncols) { + return; + } + + const int i = row*ncols + col; + //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i]; + //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU + dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX; +} + +static void diag_mask_inf_f32_cuda(const float * x, float * dst, const int ncols_x, const int nrows_x, const int rows_per_channel, const int n_past, cudaStream_t stream) { + const dim3 block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1); + const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE; + const dim3 block_nums(nrows_x, block_num_x, 1); + diag_mask_inf_f32<<>>(x, dst, ncols_x, rows_per_channel, n_past); +} + +void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int nrows0 = ggml_nrows(src0); + + const int n_past = ((int32_t *) dst->op_params)[0]; + + diag_mask_inf_f32_cuda(src0_d, dst_d, ne00, nrows0, ne01, n_past, stream); +} diff --git a/ggml-cuda/diagmask.cuh b/ggml-cuda/diagmask.cuh new file mode 100644 index 000000000..6cdbef17e --- /dev/null +++ b/ggml-cuda/diagmask.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32 + +void ggml_cuda_op_diag_mask_inf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/dmmv.cu b/ggml-cuda/dmmv.cu new file mode 100644 index 000000000..7313e3e17 --- /dev/null +++ b/ggml-cuda/dmmv.cu @@ -0,0 +1,813 @@ +#include "dmmv.cuh" +#include "dequantize.cuh" +#include "convert.cuh" + +#ifndef K_QUANTS_PER_ITERATION +#define K_QUANTS_PER_ITERATION 2 +#else +static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2"); +#endif + +static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q2_K * x = (const block_q2_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...15 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 16/K_QUANTS_PER_ITERATION; + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int s_offset = 8*im; + const int y_offset = 128*im + l0; + + uint32_t aux[4]; + const uint8_t * d = (const uint8_t *)aux; + const uint8_t * m = (const uint8_t *)(aux + 2); + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset); + aux[0] = a[0] & 0x0f0f0f0f; + aux[1] = a[1] & 0x0f0f0f0f; + aux[2] = (a[0] >> 4) & 0x0f0f0f0f; + aux[3] = (a[1] >> 4) & 0x0f0f0f0f; + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3) + + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3) + + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3) + + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3) + + y[l+16] * d[1] * ((q[l+16] >> 0) & 3) + + y[l+48] * d[3] * ((q[l+16] >> 2) & 3) + + y[l+80] * d[5] * ((q[l+16] >> 4) & 3) + +y[l+112] * d[7] * ((q[l+16] >> 6) & 3); + sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6] + + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7]; + + } + tmp += dall * sum1 - dmin * sum2; + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; + + uint32_t uaux[2]; + const uint8_t * d = (const uint8_t *)uaux; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint32_t * s = (const uint32_t *)x[i].scales; + + uaux[0] = s[0] & 0x0f0f0f0f; + uaux[1] = (s[0] >> 4) & 0x0f0f0f0f; + + const float2 dall = __half22float2(x[i].dm); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t ql = q[l]; + sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3) + + y[l+16] * d[1] * ((ql >> 2) & 3) + + y[l+32] * d[2] * ((ql >> 4) & 3) + + y[l+48] * d[3] * ((ql >> 6) & 3); + sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7]; + } + tmp += dall.x * sum1 - dall.y * sum2; + } +#endif + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q3_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q3_K * x = (const block_q3_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + + const uint16_t kmask1 = 0x0303; + const uint16_t kmask2 = 0x0f0f; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int n = K_QUANTS_PER_ITERATION; // iterations in the inner loop + const int step = 16/K_QUANTS_PER_ITERATION; + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0....15 or 0...7 + + const uint8_t m = 1 << (4*im); + + const int l0 = n*in; // 0...15 or 0...14 in steps of 2 + const int q_offset = 32*im + l0; + const int y_offset = 128*im + l0; + + uint16_t utmp[4]; + const int8_t * s = (const int8_t *)utmp; + + const uint16_t s_shift = 4*im; + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * q = x[i].qs + q_offset; + const uint8_t * h = x[i].hmask + l0; + + const uint16_t * a = (const uint16_t *)x[i].scales; + utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4); + utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4); + utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4); + utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4); + + const float d = x[i].d; + + float sum = 0; + for (int l = 0; l < n; ++l) { + sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4)) + + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4)) + + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4)) + + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4)); + sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4)) + + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4)) + + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4)) + + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4)); + } + tmp += d * sum; + + } +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 or 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0....1 or 0...3 + const int offset = tid * K_QUANTS_PER_ITERATION; // 0...15 or 0...14 + const int in = offset/8; // 0 or 1 + const int im = offset%8; // 0...7 + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + offset; + const uint8_t * q = x[i].qs + offset; + const uint8_t * s = x[i].scales; + + const float dall = (float)x[i].d; + + float sum = 0; + for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { + const uint8_t hl = x[i].hmask[im+l] >> in; + const uint8_t ql = q[l]; + sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4)) + + y[l+16] * dall * ((s[0] >> 4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4)) + + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4)) + + y[l+48] * dall * ((s[1] >> 4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q4_K * x = (const block_q4_K *)vx + ib0; + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0,1 + + const int step = 8/K_QUANTS_PER_ITERATION; // 8 or 4 + + const int il = tid/step; // 0...3 + const int ir = tid - step*il; // 0...7 or 0...3 + const int n = 2 * K_QUANTS_PER_ITERATION; // 2 or 4 + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 4; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; + s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif + + } +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + + const int step = tid * K_QUANTS_PER_ITERATION; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + float tmp = 0; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const float * y = yy + i*QK_K + step; + const uint16_t * a = (const uint16_t *)x[i].scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + const float d = (float)x[i].dm[0]; + const float m = (float)x[i].dm[1]; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2]) + + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2]) + + y[j+32] * (d * s[1] * (q[j+ 0] >> 4) - m * s[3]) + + y[j+48] * (d * s[1] * (q[j+16] >> 4) - m * s[3]); + } + tmp += sum; + } + +#endif + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q5_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols) { + + const int row = blockIdx.x; + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q5_K * x = (const block_q5_K *)vx + ib0; + + float tmp = 0; // partial sum for thread in warp + +#if QK_K == 256 + const uint16_t kmask1 = 0x3f3f; + const uint16_t kmask2 = 0x0f0f; + const uint16_t kmask3 = 0xc0c0; + + const int tid = threadIdx.x/2; // 0...15 + const int ix = threadIdx.x%2; + + const int il = tid/4; // 0...3 + const int ir = tid - 4*il;// 0...3 + const int n = 2; + + const int im = il/2; // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224 + const int in = il%2; + + const int l0 = n*(2*ir + in); + const int q_offset = 32*im + l0; + const int y_offset = 64*im + l0; + + const uint8_t hm1 = 1 << (2*im); + const uint8_t hm2 = hm1 << 4; + + uint16_t aux[4]; + const uint8_t * sc = (const uint8_t *)aux; + + uint16_t q16[8]; + const uint8_t * q4 = (const uint8_t *)q16; + + for (int i = ix; i < num_blocks_per_row; i += 2) { + + const uint8_t * ql1 = x[i].qs + q_offset; + const uint8_t * qh = x[i].qh + l0; + const float * y1 = yy + i*QK_K + y_offset; + const float * y2 = y1 + 128; + + const float dall = __low2half(x[i].dm); + const float dmin = __high2half(x[i].dm); + + const uint16_t * a = (const uint16_t *)x[i].scales; + aux[0] = a[im+0] & kmask1; + aux[1] = a[im+2] & kmask1; + aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); + aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); + + float4 sum = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + const uint16_t * q1 = (const uint16_t *)ql1; + const uint16_t * q2 = q1 + 32; + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[8] & 0x0f0f; + q16[2] = (q1[0] >> 4) & 0x0f0f; + q16[3] = (q1[8] >> 4) & 0x0f0f; + q16[4] = q2[0] & 0x0f0f; + q16[5] = q2[8] & 0x0f0f; + q16[6] = (q2[0] >> 4) & 0x0f0f; + q16[7] = (q2[8] >> 4) & 0x0f0f; + for (int l = 0; l < n; ++l) { + sum.x += y1[l+ 0] * (q4[l +0] + (qh[l+ 0] & (hm1 << 0) ? 16 : 0)) + + y1[l+16] * (q4[l +2] + (qh[l+16] & (hm1 << 0) ? 16 : 0)); + sum.y += y1[l+32] * (q4[l +4] + (qh[l+ 0] & (hm1 << 1) ? 16 : 0)) + + y1[l+48] * (q4[l +6] + (qh[l+16] & (hm1 << 1) ? 16 : 0)); + sum.z += y2[l+ 0] * (q4[l +8] + (qh[l+ 0] & (hm2 << 0) ? 16 : 0)) + + y2[l+16] * (q4[l+10] + (qh[l+16] & (hm2 << 0) ? 16 : 0)); + sum.w += y2[l+32] * (q4[l+12] + (qh[l+ 0] & (hm2 << 1) ? 16 : 0)) + + y2[l+48] * (q4[l+14] + (qh[l+16] & (hm2 << 1) ? 16 : 0)); + smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3] + + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7]; + } + tmp += dall * (sum.x * sc[0] + sum.y * sc[1] + sum.z * sc[4] + sum.w * sc[5]) - dmin * smin; + } + +#else + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...15 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); + const int step = tid * K_QUANTS_PER_ITERATION; + const int im = step/8; + const int in = step%8; + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + const uint8_t * q = x[i].qs + step; + const int8_t * s = x[i].scales; + const float * y = yy + i*QK_K + step; + const float d = x[i].d; + float sum = 0.f; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + const uint8_t h = x[i].qh[in+j] >> im; + sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16)) + + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16)) + + y[j+32] * d * s[2] * ((q[j+ 0] >> 4) - ((h >> 4) & 1 ? 0 : 16)) + + y[j+48] * d * s[3] * ((q[j+16] >> 4) - ((h >> 6) & 1 ? 0 : 16)); + } + tmp += sum; + } +#endif + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (threadIdx.x == 0) { + dst[row] = tmp; + } +} + +static __global__ void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { + + static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); + + const int row = blockIdx.x*blockDim.y + threadIdx.y; + if (row > nrows) return; + + const int num_blocks_per_row = ncols / QK_K; + const int ib0 = row*num_blocks_per_row; + + const block_q6_K * x = (const block_q6_K *)vx + ib0; + +#if QK_K == 256 + + const int tid = threadIdx.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 + const int ix = threadIdx.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 + + const int step = 16/K_QUANTS_PER_ITERATION; // 16 or 8 + + const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... + const int in = tid - step*im; // 0...15 or 0...7 + +#if K_QUANTS_PER_ITERATION == 1 + const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 + const int is = 0; +#else + const int l0 = 4 * in; // 0, 4, 8, ..., 28 + const int is = in / 4; +#endif + const int ql_offset = 64*im + l0; + const int qh_offset = 32*im + l0; + const int s_offset = 8*im + is; + const int y_offset = 128*im + l0; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + y_offset; + const uint8_t * ql = x[i].ql + ql_offset; + const uint8_t * qh = x[i].qh + qh_offset; + const int8_t * s = x[i].scales + s_offset; + + const float d = x[i].d; + +#if K_QUANTS_PER_ITERATION == 1 + float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) + + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32) + + y[64] * s[4] * d * ((int8_t)((ql[ 0] >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32) + + y[80] * s[5] * d * ((int8_t)((ql[16] >> 4) | ((qh[16] & 0x30) >> 0)) - 32) + + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); + tmp += sum; +#else + float sum = 0; + for (int l = 0; l < 4; ++l) { + sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) + + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32) + + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32) + + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + } + tmp += sum; +#endif + + } + +#else + + const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION); // 0...7 + const int ix = threadIdx.x%(2*K_QUANTS_PER_ITERATION); // 0...3 + + const int step = tid * K_QUANTS_PER_ITERATION; + + float tmp = 0; // partial sum for thread in warp + + for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) { + + const float * y = yy + i * QK_K + step; + const uint8_t * ql = x[i].ql + step; + const uint8_t * qh = x[i].qh + step; + const int8_t * s = x[i].scales; + + const float d = x[i+0].d; + + float sum = 0; + for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) { + sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32) + + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32) + + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >> 4) | ((qh[j] & 0x30) >> 0)) - 32) + + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >> 4) | ((qh[j] & 0xc0) >> 2)) - 32); + } + tmp += sum; + + } + +#endif + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (tid == 0) { + dst[row] = tmp; + } +} + +static __device__ void convert_f16(const void * vx, const int64_t ib, const int iqs, dfloat2 & v){ + const half * x = (const half *) vx; + + // automatic half -> float type cast if dfloat == float + v.x = x[ib + iqs + 0]; + v.y = x[ib + iqs + 1]; +} + +template +static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { + // qk = quantized weights per x block + // qr = number of quantized weights per data value in x block + const int64_t row = (int64_t)blockIdx.x*blockDim.y + threadIdx.y; + + if (row >= nrows) { + return; + } + + const int tid = threadIdx.x; + + const int iter_stride = 2*GGML_CUDA_DMMV_X; + const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter + const int y_offset = qr == 1 ? 1 : qk/2; + +// partial sum for each thread +#ifdef GGML_CUDA_F16 + half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics +#else + float tmp = 0.0f; +#endif // GGML_CUDA_F16 + + for (int i = 0; i < ncols; i += iter_stride) { + const int col = i + vals_per_iter*tid; + const int64_t ib = ((int64_t)row*ncols + col)/qk; // x block index + const int iqs = (col%qk)/qr; // x quant index + const int iybs = col - col%qk; // y block start index + +// processing >2 values per i iter is faster for fast GPUs +#pragma unroll + for (int j = 0; j < vals_per_iter; j += 2) { + // process 2 vals per j iter + + // dequantize + // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val + dfloat2 v; + dequantize_kernel(vx, ib, iqs + j/qr, v); + + // matrix multiplication + // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 +#ifdef GGML_CUDA_F16 + tmp += __hmul2(v, { + y[iybs + iqs + j/qr + 0], + y[iybs + iqs + j/qr + y_offset] + }); +#else + tmp += v.x * y[iybs + iqs + j/qr + 0]; + tmp += v.y * y[iybs + iqs + j/qr + y_offset]; +#endif // GGML_CUDA_F16 + } + } + + // sum up partial sums and write back result + tmp = warp_reduce_sum(tmp); + + if (tid == 0) { +#ifdef GGML_CUDA_F16 + dst[row] = tmp.x + tmp.y; +#else + dst[row] = tmp; +#endif // GGML_CUDA_F16 + } +} + +static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); +} + +static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const dim3 block_dims(32, 1, 1); + dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); +} + +static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int ny = 2 / K_QUANTS_PER_ITERATION; + const int block_num_y = (nrows + ny - 1) / ny; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(32, ny, 1); + dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); +} + +static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + dequantize_mul_mat_vec<1, 1, convert_f16> + <<>>(vx, y, dst, ncols, nrows); +} + +void ggml_cuda_op_dequantize_mul_mat_vec( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + GGML_UNUSED(ctx); + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics +#ifdef GGML_CUDA_F16 + ggml_cuda_pool_alloc src1_dfloat_a(ctx.pool()); + half * src1_dfloat = nullptr; // dfloat == half + + bool src1_convert_f16 = + src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 || + src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 || + src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16; + + if (src1_convert_f16) { + src1_dfloat = src1_dfloat_a.alloc(ne00); + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + to_fp16_cuda(src1_ddf_i, src1_dfloat, ne00, stream); + } +#else + const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion +#endif // GGML_CUDA_F16 + + switch (src0->type) { + case GGML_TYPE_Q4_0: + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_1: + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_0: + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_1: + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q8_0: + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q2_K: + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q3_K: + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q4_K: + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q5_K: + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_Q6_K: + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_F16: + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddq_i); + GGML_UNUSED(src1_ncols); + GGML_UNUSED(src1_padded_row_size); +} diff --git a/ggml-cuda/dmmv.cuh b/ggml-cuda/dmmv.cuh new file mode 100644 index 000000000..4c5ebd475 --- /dev/null +++ b/ggml-cuda/dmmv.cuh @@ -0,0 +1,18 @@ +#include "common.cuh" + +// dmmv = dequantize_mul_mat_vec + +// TODO: remove this? +#ifndef GGML_CUDA_DMMV_X +#define GGML_CUDA_DMMV_X 32 +#endif + +#ifndef GGML_CUDA_MMV_Y +#define GGML_CUDA_MMV_Y 1 +#endif + +void ggml_cuda_op_dequantize_mul_mat_vec( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/ggml-cuda/fattn.cu b/ggml-cuda/fattn.cu new file mode 100644 index 000000000..7c486f482 --- /dev/null +++ b/ggml-cuda/fattn.cu @@ -0,0 +1,1085 @@ +#include "common.cuh" +#include "fattn.cuh" + +#include + +#if FP16_MMA_AVAILABLE +#include +#endif + +#define FATTN_KQ_STRIDE 256 +#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction. +#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs. + +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(D, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_vec_ext_f16( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { +#if FP16_AVAILABLE + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0); + const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio)); + const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) mask + ne11*ic0; + + const int stride_KV = nb11 / sizeof(half); + const int stride_KV2 = nb11 / sizeof(half2); + + static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64."); + constexpr int nwarps = D / WARP_SIZE; + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + __builtin_assume(tid < D); + + __shared__ half KQ[ncols*D]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + KQ[j*D + tid] = -HALF_MAX_HALF; + } + half2 * KQ2 = (half2 *) KQ; + + half kqmax[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax[j] = -HALF_MAX_HALF; + } + half kqsum[ncols] = {0.0f}; + + __shared__ half kqmax_shared[ncols][WARP_SIZE]; + __shared__ half kqsum_shared[ncols][WARP_SIZE]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + if (threadIdx.y == 0) { + kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF; + kqsum_shared[j][threadIdx.x] = 0.0f; + } + } + __syncthreads(); + + // Convert Q to half2 and store in registers: + half2 Q_h2[ncols][D/(2*WARP_SIZE)]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + + const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i]; + Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y); + } + } + + half2 VKQ[ncols] = {{0.0f, 0.0f}}; + + const int k_start = parallel_blocks == 1 ? 0 : ip*D; + for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) { + // Calculate KQ tile and keep track of new maximum KQ values: + + // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression, + // see https://github.com/ggerganov/llama.cpp/pull/7061 . + // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable). + half kqmax_new = kqmax[0]; + half kqmax_new_arr[ncols]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqmax_new_arr[j] = kqmax[j]; + } + +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) { + const int i_KQ = i_KQ_0 + threadIdx.y; + + if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) { + break; + } + + half2 sum2[ncols] = {{0.0f, 0.0f}}; +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) { + const int k_KQ = k_KQ_0 + threadIdx.x; + + const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE]; + } + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + sum2[j] = warp_reduce_sum(sum2[j]); + half sum = __low2half(sum2[j]) + __high2half(sum2[j]); + sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f); + + if (ncols == 1) { + kqmax_new = ggml_cuda_hmax(kqmax_new, sum); + } else { + kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum); + } + + if (threadIdx.x == 0) { + KQ[j*D + i_KQ] = sum; + } + } + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j]; + + kqmax_new_j = warp_reduce_max(kqmax_new_j); + if (threadIdx.x == 0) { + kqmax_shared[j][threadIdx.y] = kqmax_new_j; + } + } + + __syncthreads(); + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + half kqmax_new_j = kqmax_shared[j][threadIdx.x]; + kqmax_new_j = warp_reduce_max(kqmax_new_j); + + const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j); + kqmax[j] = kqmax_new_j; + + const half val = hexp(KQ[j*D + tid] - kqmax[j]); + kqsum[j] = kqsum[j]*KQ_max_scale + val; + KQ[j*D + tid] = val; + + VKQ[j] *= __half2half2(KQ_max_scale); + } + + __syncthreads(); + +#pragma unroll + for (int k0 = 0; k0 < D; k0 += 2) { + if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) { + break; + } + + half2 V_k; + reinterpret_cast(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid]; + reinterpret_cast(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid]; +#pragma unroll + for (int j = 0; j < ncols; ++j) { + VKQ[j] += V_k*KQ2[j*(D/2) + k0/2]; + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j = 0; j < ncols; ++j) { + kqsum[j] = warp_reduce_sum(kqsum[j]); + if (threadIdx.x == 0) { + kqsum_shared[j][threadIdx.y] = kqsum[j]; + } + } + + __syncthreads(); + +#pragma unroll + for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) { + kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x]; + kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]); + + half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ])); + if (parallel_blocks == 1) { + dst_val /= kqsum[j_VKQ]; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val; + } + + if (parallel_blocks != 1 && tid != 0) { +#pragma unroll + for (int j = 0; j < ncols; ++j) { + dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]); + } + } +#else + NO_DEVICE_CODE; +#endif // FP16_AVAILABLE +} + +// D == head size, VKQ_stride == num VKQ rows calculated in parallel: +template +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(nwarps*WARP_SIZE, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_ext_f16( + const char * __restrict__ Q, + const char * __restrict__ K, + const char * __restrict__ V, + const char * __restrict__ mask, + float * __restrict__ dst, + float2 * __restrict__ dst_meta, + const float scale, + const int ne00, + const int ne01, + const int ne02, + const int ne03, + const int ne10, + const int ne11, + const int ne12, + const int ne13, + const int ne31, + const int nb31, + const int nb01, + const int nb02, + const int nb03, + const int nb11, + const int nb12, + const int nb13, + const int ne0, + const int ne1, + const int ne2, + const int ne3) { +#if FP16_MMA_AVAILABLE + //In this kernel Q, K, V are matrices while i, j, k are matrix indices. + + const int ic0 = ncols*(blockIdx.x / parallel_blocks); // Index of the first Q/QKV column to work on. + const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel. + + static_assert(D <= FATTN_KQ_STRIDE, "D must be <= FATTN_KQ_STRIDE."); + static_assert(ncols == 8 || ncols % 16 == 0, "ncols must be 8 or a multiple of 16."); + constexpr int frag_m = ncols == 8 ? 32 : 16; + constexpr int frag_n = ncols == 8 ? 8 : 16; + static_assert(D % frag_m == 0, "If ncols == 8 then D % frag_m must be 0."); + typedef nvcuda::wmma::fragment frag_a_K; + typedef nvcuda::wmma::fragment frag_a_V; + typedef nvcuda::wmma::fragment frag_b; + typedef nvcuda::wmma::fragment frag_c_KQ; + typedef nvcuda::wmma::fragment frag_c_VKQ; + + constexpr int KQ_stride_tc = nwarps*frag_m; // Number of KQ rows calculated in parallel. + constexpr int VKQ_ratio = KQ_stride_tc/VKQ_stride; // Number of parallel VKQ accumulators needed to keep all warps busy. + static_assert(VKQ_ratio <= nwarps, "VKQ_ratio must be <= nwarps."); + + // Pad internal representation of KQ, KQV to reduce shared memory bank conflicts: + constexpr int D_padded = D + 8; + constexpr int kqs_padded = FATTN_KQ_STRIDE + 8; + constexpr int kqar = sizeof(KQ_acc_t)/sizeof(half); + + const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix. + const float * Q_f = (const float *) (Q + nb02* blockIdx.y + nb01*ic0); + const half * K_h = (const half *) (K + nb12*(blockIdx.y / gqa_ratio)); + const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape + const half * maskh = (const half *) mask + (nb31/sizeof(half))* ic0; + const half2 * mask2 = (const half2 *) mask + (nb31/sizeof(half))*(ic0/2); + + const int stride_Q = nb01 / sizeof(float); + const int stride_KV = nb11 / sizeof(half); + + frag_b Q_b[D/16][ncols/frag_n]; + + // A single buffer for temporarily holding tiles of KQ and VKQ parts: + constexpr int mem_KQ = ncols*kqs_padded*kqar; + constexpr int mem_VKQ_parts = VKQ_ratio*ncols*D_padded; + __shared__ half KQ[mem_KQ >= mem_VKQ_parts ? mem_KQ : mem_VKQ_parts]; + float * KQ_f = (float *) KQ; + half2 * KQ2 = (half2 *) KQ; + + float KQ_rowsum_f[ncols/nwarps] = {0.0f}; + float KQ_max_f[ncols/nwarps]; + float KQ_max_scale_f[ncols/nwarps] = {0.0f}; + +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + KQ_max_f[j] = -FLT_MAX/2.0f; + } + + half2 KQ_rowsum_h2[ncols/nwarps] = {{0.0f, 0.0f}}; + half2 KQ_max_h2[ncols/nwarps]; + half2 KQ_max_scale_h2[ncols/nwarps] = {{0.0f, 0.0f}}; + +#pragma unroll + for (int j = 0; j < ncols/nwarps; ++j) { + KQ_max_h2[j] = make_half2(-HALF_MAX_HALF, -HALF_MAX_HALF); + } + + __shared__ half VKQ[ncols*D_padded]; // Accumulator for final VKQ slice. + half2 * VKQ2 = (half2 *) VKQ; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D/2 && i >= D/2) { + break; + } + VKQ2[j*(D_padded/2) + i] = make_half2(0.0f, 0.0f); + } + } + + // Convert Q to half and apply scale, temporarily store in KQ: +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; +#pragma unroll + for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D && i >= D) { + break; + } + KQ[j*D_padded + i] = ic0 + j < ne01 ? Q_f[j*stride_Q + i] * scale : 0.0f; + } + } + + __syncthreads(); + + // Load Q into tensor core fragments/registers since it will be used frequently: +#pragma unroll + for (int i0 = 0; i0 < D; i0 += 16) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::load_matrix_sync(Q_b[i0/16][j0/frag_n], KQ + j0*D_padded + i0, D_padded); + } + } + + __syncthreads(); + + // Iterate over ne11 == previous tokens: + for (int k_VKQ_0 = ip*FATTN_KQ_STRIDE; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*FATTN_KQ_STRIDE) { + // Calculate tile of KQ: +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < FATTN_KQ_STRIDE; i_KQ_0 += KQ_stride_tc) { + frag_c_KQ KQ_c[ncols/frag_n]; +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::fill_fragment(KQ_c[j], 0.0f); + } +#pragma unroll + for (int k_KQ_0 = 0; k_KQ_0 < D; k_KQ_0 += 16) { + frag_a_K K_a; + nvcuda::wmma::load_matrix_sync(K_a, K_h + (k_VKQ_0 + i_KQ_0 + frag_m*threadIdx.y)*stride_KV + k_KQ_0, stride_KV); +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::mma_sync(KQ_c[j], K_a, Q_b[k_KQ_0/16][j], KQ_c[j]); + } + } +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::store_matrix_sync((KQ_acc_t *) KQ + j0*kqs_padded + i_KQ_0 + frag_m*threadIdx.y, KQ_c[j0/frag_n], kqs_padded, nvcuda::wmma::mem_col_major); + } + } + + __syncthreads(); + + // Calculate softmax for each KQ column using the current max. value. + // The divisor is stored in KQ_rowsum and will be applied at the end. +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + if (std::is_same::value) { + float KQ_f_tmp[FATTN_KQ_STRIDE / WARP_SIZE]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ_f_tmp[k0/WARP_SIZE] = KQ_f[j*kqs_padded + k]; + } + + float KQ_max_new = KQ_max_f[j0/nwarps]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f; + KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]); + } + KQ_max_new = warp_reduce_max(KQ_max_new); + + const float diff = KQ_max_f[j0/nwarps] - KQ_max_new; + KQ_max_scale_f[j0/nwarps] = expf(diff); + if (diff <= SOFTMAX_FTZ_THRESHOLD) { + KQ_max_scale_f[j0/nwarps] = 0.0f; + } + KQ_max_f[j0/nwarps] = KQ_max_new; + + float KQ_rowsum_add = 0.0f; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + const float diff = KQ_f_tmp[k0/WARP_SIZE] - KQ_max_f[j0/nwarps]; + KQ_f_tmp[k0/WARP_SIZE] = expf(diff); + if (diff <= SOFTMAX_FTZ_THRESHOLD) { + KQ_f_tmp[k0/WARP_SIZE] = 0.0f; + } + KQ_rowsum_add += KQ_f_tmp[k0/WARP_SIZE]; + KQ[j*(kqar*kqs_padded) + k] = KQ_f_tmp[k0/WARP_SIZE]; + } + KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); + + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: + KQ_rowsum_f[j0/nwarps] = KQ_max_scale_f[j0/nwarps]*KQ_rowsum_f[j0/nwarps] + KQ_rowsum_add; + } else { + half2 KQ2_tmp[FATTN_KQ_STRIDE/(2*WARP_SIZE)]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ2_tmp[k0/WARP_SIZE] = KQ2[j*(kqs_padded/2) + k]; + } + + half2 KQ_max_new = KQ_max_h2[j0/nwarps]; +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f); + KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]); + } + KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new)))); + const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new; + KQ_max_scale_h2[j0/nwarps] = h2exp(diff); + const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); + *((uint32_t *) &KQ_max_scale_h2[j0/nwarps]) &= ftz_mask; + KQ_max_h2[j0/nwarps] = KQ_max_new; + + half2 KQ_rowsum_add = make_half2(0.0f, 0.0f); +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) { + const int k = k0 + threadIdx.x; + + const half2 diff = KQ2_tmp[k0/WARP_SIZE] - KQ_max_h2[j0/nwarps]; + KQ2_tmp[k0/WARP_SIZE] = h2exp(diff); + const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD)); + *((uint32_t *) &KQ2_tmp[k0/WARP_SIZE]) &= ftz_mask; + KQ_rowsum_add += KQ2_tmp[k0/WARP_SIZE]; + KQ2[j*(kqs_padded/2) + k] = KQ2_tmp[k0/WARP_SIZE]; + } + KQ_rowsum_add = warp_reduce_sum(KQ_rowsum_add); + + // Scale previous KQ_rowsum to account for a potential increase in KQ_max: + KQ_rowsum_h2[j0/nwarps] = KQ_max_scale_h2[j0/nwarps]*KQ_rowsum_h2[j0/nwarps] + KQ_rowsum_add; + } + } + + __syncthreads(); + + frag_b KQ_b[FATTN_KQ_STRIDE/(VKQ_ratio*16)][ncols/frag_n]; +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { + const int k = k0 + (threadIdx.y % VKQ_ratio)*16; + nvcuda::wmma::load_matrix_sync( + KQ_b[k0/(VKQ_ratio*16)][j0/frag_n], + KQ + j0*(kqar*kqs_padded) + k, + kqar*kqs_padded); + } + } + + frag_c_VKQ VKQ_c[D/VKQ_stride][ncols/frag_n]; +#pragma unroll + for (int i_VKQ_0 = 0; i_VKQ_0 < D; i_VKQ_0 += VKQ_stride) { +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::fill_fragment(VKQ_c[i_VKQ_0/VKQ_stride][j], 0.0f); + } + +#pragma unroll + for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += VKQ_ratio*16) { + const int k = k0 + (threadIdx.y % VKQ_ratio)*16; + + frag_a_V v_a; + nvcuda::wmma::load_matrix_sync(v_a, V_h + (k_VKQ_0 + k)*stride_KV + i_VKQ_0 + frag_m*(threadIdx.y/VKQ_ratio), stride_KV); +#pragma unroll + for (int j = 0; j < ncols/frag_n; ++j) { + nvcuda::wmma::mma_sync(VKQ_c[i_VKQ_0/VKQ_stride][j], v_a, KQ_b[k0/(VKQ_ratio*16)][j], VKQ_c[i_VKQ_0/VKQ_stride][j]); + } + } + } + + __syncthreads(); + + const int offset_k = (threadIdx.y % VKQ_ratio) * (ncols*D_padded); +#pragma unroll + for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += VKQ_stride) { +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += frag_n) { + nvcuda::wmma::store_matrix_sync( + KQ + offset_k + j0*D_padded + i_KQ_0 + frag_m*(threadIdx.y/VKQ_ratio), + VKQ_c[i_KQ_0/VKQ_stride][j0/frag_n], + D_padded, nvcuda::wmma::mem_col_major); + } + } + + __syncthreads(); + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j = j0 + threadIdx.y; + + half2 VKQ_scale; + if (std::is_same::value) { + VKQ_scale = make_half2(KQ_max_scale_f[j0/nwarps], KQ_max_scale_f[j0/nwarps]); + } else { + VKQ_scale = KQ_max_scale_h2[j0/nwarps]; + } + +#pragma unroll + for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D/2 && i >= D/2) { + break; + } + + half2 VKQ_add = make_half2(0.0f, 0.0f); +#pragma unroll + for (int l = 0; l < VKQ_ratio; ++l) { + VKQ_add += KQ2[l*(ncols*D_padded/2) + j*(D_padded/2) + i]; + } + VKQ2[j*(D_padded/2) + i] = VKQ_scale*VKQ2[j*(D_padded/2) + i] + VKQ_add; + } + } + + __syncthreads(); + } + +#pragma unroll + for (int j0 = 0; j0 < ncols; j0 += nwarps) { + const int j_VKQ = j0 + threadIdx.y; + if (ic0 + j_VKQ >= ne01) { + return; + } + const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip; + + float KQ_rowsum_j; + if (std::is_same::value) { + KQ_rowsum_j = KQ_rowsum_f[j0/nwarps]; + } else { + KQ_rowsum_j = __low2float(KQ_rowsum_h2[j0/nwarps]) + __high2float(KQ_rowsum_h2[j0/nwarps]); + } + +#pragma unroll + for (int i0 = 0; i0 < D; i0 += WARP_SIZE) { + const int i = i0 + threadIdx.x; + if (i0 + WARP_SIZE > D && i >= D) { + break; + } + float dst_val = VKQ[j_VKQ*D_padded + i]; + if (parallel_blocks == 1) { + dst_val /= KQ_rowsum_j; + } + dst[j_dst*gridDim.y*D + blockIdx.y*D + i] = dst_val; + } + + if (parallel_blocks == 1 || threadIdx.x != 0) { + continue; + } + + float2 dst_meta_val; + if (std::is_same::value) { + dst_meta_val.x = KQ_max_f[j0/nwarps]; + } else { + dst_meta_val.x = __low2float(KQ_max_h2[j0/nwarps]); + } + dst_meta_val.y = KQ_rowsum_j; + dst_meta[(ic0 + j_VKQ)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = dst_meta_val; + } +#else + NO_DEVICE_CODE; +#endif // FP16_MMA_AVAILABLE +} + +template // D == head size +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +__launch_bounds__(D, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void flash_attn_combine_results( + const float * __restrict__ VKQ_parts, + const float2 * __restrict__ VKQ_meta, + float * __restrict__ dst) { +#if FP16_AVAILABLE + VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x; + VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x; + dst += D * gridDim.y*blockIdx.x; + + const int tid = threadIdx.x; + __builtin_assume(tid < D); + + __shared__ float2 meta[parallel_blocks]; + if (tid < 2*parallel_blocks) { + ((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid]; + } + + __syncthreads(); + + float kqmax = meta[0].x; +#pragma unroll + for (int l = 1; l < parallel_blocks; ++l) { + kqmax = max(kqmax, meta[l].x); + } + + float VKQ_numerator = 0.0f; + float VKQ_denominator = 0.0f; +#pragma unroll + for (int l = 0; l < parallel_blocks; ++l) { + const float diff = meta[l].x - kqmax; + const float KQ_max_scale = expf(diff); + const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD); + *((uint32_t *) &KQ_max_scale) &= ftz_mask; + + VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid]; + VKQ_denominator += KQ_max_scale * meta[l].y; + } + + dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator; +#else + NO_DEVICE_CODE; +#endif // FP16_AVAILABLE +} + +constexpr int get_max_power_of_2(int x) { + return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1; +} + +static_assert(get_max_power_of_2(1) == 1, "Test failed."); +static_assert(get_max_power_of_2(2) == 2, "Test failed."); +static_assert(get_max_power_of_2(4) == 4, "Test failed."); +static_assert(get_max_power_of_2(6) == 2, "Test failed."); + +// Number of VKQ rows calculated in parallel: +constexpr int get_VKQ_stride(int D, int nwarps, int frag_m) { + return (get_max_power_of_2(D/frag_m) < nwarps ? get_max_power_of_2(D/frag_m) : nwarps)*frag_m; +} + +static_assert(get_VKQ_stride(128, 1, 32) == 32, "Test failed."); +static_assert(get_VKQ_stride(128, 2, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride(128, 4, 32) == 128, "Test failed."); +static_assert(get_VKQ_stride( 64, 1, 32) == 32, "Test failed."); +static_assert(get_VKQ_stride( 64, 2, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride( 64, 4, 32) == 64, "Test failed."); +static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed."); +static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed."); +static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed."); + +template void launch_fattn_vec_f16( + const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, + ggml_cuda_pool & pool, cudaStream_t main_stream +) { + ggml_cuda_pool_alloc dst_tmp(pool); + ggml_cuda_pool_alloc dst_tmp_meta(pool); + + if (parallel_blocks > 1) { + dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); + dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV)); + } + + constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_dim(WARP_SIZE, nwarps, 1); + const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]); + const int shmem = 0; + + float scale; + memcpy(&scale, KQV->op_params, sizeof(float)); + + flash_attn_vec_ext_f16 + <<>> ( + (const char *) Q->data, + (const char *) K->data, + (const char *) V->data, + mask ? ((const char *) mask->data) : nullptr, + parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, + scale, + Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], + K->ne[0], K->ne[1], K->ne[2], K->ne[3], + mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, + Q->nb[1], Q->nb[2], Q->nb[3], + K->nb[1], K->nb[2], K->nb[3], + KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] + ); + CUDA_CHECK(cudaGetLastError()); + + if (parallel_blocks == 1) { + return; + } + + const dim3 block_dim_combine(D, 1, 1); + const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z); + const int shmem_combine = 0; + + flash_attn_combine_results + <<>> + (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data); + CUDA_CHECK(cudaGetLastError()); +} + +template void launch_fattn_f16_impl( + const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, + ggml_cuda_pool & pool, cudaStream_t main_stream +) { + ggml_cuda_pool_alloc dst_tmp(pool); + ggml_cuda_pool_alloc dst_tmp_meta(pool); + + if (parallel_blocks > 1) { + dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV)); + dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV)); + } + + constexpr int frag_m = (cols_per_block) == 8 && (D) % 32 == 0 ? 32 : 16; + const dim3 block_dim(WARP_SIZE, nwarps, 1); + const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]); + const int shmem = 0; + + float scale; + memcpy(&scale, KQV->op_params, sizeof(float)); + + flash_attn_ext_f16 + <<>> ( + (const char *) Q->data, + (const char *) K->data, + (const char *) V->data, + mask ? ((const char *) mask->data) : nullptr, + (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr, + scale, + Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3], + K->ne[0], K->ne[1], K->ne[2], K->ne[3], + mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0, + Q->nb[1], Q->nb[2], Q->nb[3], + K->nb[1], K->nb[2], K->nb[3], + KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3] + ); + CUDA_CHECK(cudaGetLastError()); + + if ((parallel_blocks) == 1) { + return; + } + + const dim3 block_dim_combine(D, 1, 1); + const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z); + const int shmem_combine = 0; + + flash_attn_combine_results + <<>> + (dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data); + CUDA_CHECK(cudaGetLastError()); +} + +template void launch_fattn_f16( + const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask, + const int nsm, ggml_cuda_pool & pool, cudaStream_t main_stream +) { + const int blocks_num_pb1 = ((Q->ne[1] + cols_per_block - 1) / cols_per_block)*Q->ne[2]*Q->ne[3]; + + if (4*blocks_num_pb1 < 2*nsm) { + launch_fattn_f16_impl(Q, K, V, KQV, mask, pool, main_stream); + return; + } + if (2*blocks_num_pb1 < 2*nsm) { + launch_fattn_f16_impl(Q, K, V, KQV, mask, pool, main_stream); + return; + } + launch_fattn_f16_impl(Q, K, V, KQV, mask, pool, main_stream); +} + +void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * Q = dst->src[0]; + const ggml_tensor * K = dst->src[1]; + const ggml_tensor * V = dst->src[2]; + + const ggml_tensor * mask = dst->src[3]; + + ggml_tensor * KQV = dst; + + GGML_ASSERT(Q->type == GGML_TYPE_F32); + GGML_ASSERT(K->type == GGML_TYPE_F16); + GGML_ASSERT(V->type == GGML_TYPE_F16); + GGML_ASSERT(KQV->type == GGML_TYPE_F32); + + GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16); + GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) && + "the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big"); + + GGML_ASSERT(K->ne[1] % FATTN_KQ_STRIDE == 0 && "Incorrect KV cache padding."); + + ggml_cuda_set_device(ctx.device); + + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; + const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm; + + const int32_t precision = KQV->op_params[1]; + + if (!fp16_mma_available(cc)) { + GGML_ASSERT(precision == GGML_PREC_DEFAULT); + GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128."); + + if (Q->ne[1] == 1) { + constexpr int cols_per_block = 1; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] == 2) { + constexpr int cols_per_block = 2; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 4) { + constexpr int cols_per_block = 4; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 8) { + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + constexpr int cols_per_block = 8; + constexpr int parallel_blocks = 1; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (precision != GGML_PREC_DEFAULT) { + if (Q->ne[1] <= 32 || Q->ne[0] > 128) { + constexpr int cols_per_block = 16; + constexpr int nwarps = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 80: + launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 96: + launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 112: + launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 256: + launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + } else { + constexpr int cols_per_block = 32; + constexpr int nwarps = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_f16< 64, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 80: + launch_fattn_f16< 80, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 96: + launch_fattn_f16< 96, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 112: + launch_fattn_f16<112, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_f16<128, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + // case 256: + // launch_fattn_f16<256, cols_per_block, nwarps, float>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + // break; + default: + GGML_ASSERT(false); + break; + } + } + return; + } + + if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) { + constexpr int cols_per_block = 1; + constexpr int parallel_blocks = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + case 256: + launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 8 && Q->ne[0] % WARP_SIZE == 0) { + constexpr int cols_per_block = 8; + constexpr int nwarps = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 96: + launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 256: + launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + if (Q->ne[1] <= 32) { + constexpr int cols_per_block = 16; + constexpr int nwarps = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 80: + launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 96: + launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 112: + launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 256: + launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; + } + + constexpr int cols_per_block = 32; + constexpr int nwarps = 4; + switch (Q->ne[0]) { + case 64: + launch_fattn_f16< 64, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 80: + launch_fattn_f16< 80, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 96: + launch_fattn_f16< 96, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 112: + launch_fattn_f16<112, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 128: + launch_fattn_f16<128, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + case 256: + launch_fattn_f16<256, cols_per_block, nwarps, half>(Q, K, V, KQV, mask, nsm, ctx.pool(), ctx.stream()); + break; + default: + GGML_ASSERT(false); + break; + } + return; +} diff --git a/ggml-cuda/fattn.cuh b/ggml-cuda/fattn.cuh new file mode 100644 index 000000000..ad3ca7a8d --- /dev/null +++ b/ggml-cuda/fattn.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/getrows.cu b/ggml-cuda/getrows.cu new file mode 100644 index 000000000..55af195fd --- /dev/null +++ b/ggml-cuda/getrows.cu @@ -0,0 +1,178 @@ +#include "getrows.cuh" +#include "dequantize.cuh" + +template +static __global__ void k_get_rows( + const void * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = (blockIdx.x*blockDim.x + threadIdx.x)*2; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03; + + const int ib = i00/qk; // block index + const int iqs = (i00%qk)/qr; // quant index + const int iybs = i00 - i00%qk; // dst block start index + const int y_offset = qr == 1 ? 1 : qk/2; + + // dequantize + dfloat2 v; + dequantize_kernel(src0_row, ib, iqs, v); + + dst_row[iybs + iqs + 0] = v.x; + dst_row[iybs + iqs + y_offset] = v.y; +} + +template +static __global__ void k_get_rows_float( + const src0_t * src0, const int32_t * src1, dst_t * dst, + int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/ + /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/ + /*size_t s0,*/ size_t s1, size_t s2, size_t s3, + /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03, + size_t s10, size_t s11, size_t s12/*, size_t s13*/) { + + const int i00 = blockIdx.x*blockDim.x + threadIdx.x; + const int i10 = blockDim.y*blockIdx.y + threadIdx.y; + const int i11 = (blockIdx.z*blockDim.z + threadIdx.z)/ne12; + const int i12 = (blockIdx.z*blockDim.z + threadIdx.z)%ne12; + + if (i00 >= ne00) { + return; + } + + const int i01 = src1[i10*s10 + i11*s11 + i12*s12]; + + dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3; + const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03); + + dst_row[i00] = src0_row[i00]; +} + +template +static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const void * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE); + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + GGML_ASSERT(ne00 % 2 == 0); + + k_get_rows<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + GGML_UNUSED(dst); +} + +template +static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, + const src0_t * src0_dd, const int32_t * src1_dd, float * dst_dd, cudaStream_t stream) { + + GGML_TENSOR_BINARY_OP_LOCALS + + const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1); + const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE; + const dim3 block_nums(block_num_x, ne10, ne11*ne12); + + // strides in elements + //const size_t s0 = nb0 / ggml_element_size(dst); + const size_t s1 = nb1 / ggml_element_size(dst); + const size_t s2 = nb2 / ggml_element_size(dst); + const size_t s3 = nb3 / ggml_element_size(dst); + + const size_t s10 = nb10 / ggml_element_size(src1); + const size_t s11 = nb11 / ggml_element_size(src1); + const size_t s12 = nb12 / ggml_element_size(src1); + //const size_t s13 = nb13 / ggml_element_size(src1); + + k_get_rows_float<<>>( + src0_dd, src1_dd, dst_dd, + ne00, /*ne01, ne02, ne03,*/ + /*ne10, ne11,*/ ne12, /*ne13,*/ + /* s0,*/ s1, s2, s3, + /* nb00,*/ nb01, nb02, nb03, + s10, s11, s12/*, s13*/); + + GGML_UNUSED(dst); +} + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type)); + GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type)); + GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type)); + + const int32_t * src1_i32 = (const int32_t *) src1_d; + + switch (src0->type) { + case GGML_TYPE_F16: + get_rows_cuda_float(src0, src1, dst, (const half *)src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_F32: + get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q4_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q5_1: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + case GGML_TYPE_Q8_0: + get_rows_cuda(src0, src1, dst, src0_d, src1_i32, dst_d, stream); + break; + default: + // TODO: k-quants + fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type)); + GGML_ASSERT(false); + break; + } +} diff --git a/ggml-cuda/getrows.cuh b/ggml-cuda/getrows.cuh new file mode 100644 index 000000000..bbf130232 --- /dev/null +++ b/ggml-cuda/getrows.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_GET_ROWS_BLOCK_SIZE 256 + +void ggml_cuda_op_get_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/im2col.cu b/ggml-cuda/im2col.cu new file mode 100644 index 000000000..3d0d8d4e6 --- /dev/null +++ b/ggml-cuda/im2col.cu @@ -0,0 +1,104 @@ +#include "im2col.cuh" + +template +static __global__ void im2col_kernel( + const float * x, T * dst, int64_t batch_offset, + int64_t offset_delta, int64_t IC, int64_t IW, int64_t IH, int64_t OH, int64_t OW, int64_t KW, int64_t KH, int64_t pelements, int64_t CHW, + int s0, int s1, int p0, int p1, int d0, int d1) { + const int64_t i = threadIdx.x + blockIdx.x * blockDim.x; + if (i >= pelements) { + return; + } + + const int64_t ksize = OW * (KH > 1 ? KW : 1); + const int64_t kx = i / ksize; + const int64_t kd = kx * ksize; + const int64_t ky = (i - kd) / OW; + const int64_t ix = i % OW; + + const int64_t oh = blockIdx.y; + const int64_t batch = blockIdx.z / IC; + const int64_t ic = blockIdx.z % IC; + + const int64_t iiw = ix * s0 + kx * d0 - p0; + const int64_t iih = oh * s1 + ky * d1 - p1; + + const int64_t offset_dst = + ((batch * OH + oh) * OW + ix) * CHW + + (ic * (KW * KH) + ky * KW + kx); + + if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) { + dst[offset_dst] = 0.0f; + } else { + const int64_t offset_src = ic * offset_delta + batch * batch_offset; + dst[offset_dst] = x[offset_src + iih * IW + iiw]; + } +} + +template +static void im2col_cuda(const float * x, T* dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + const int parallel_elements = OW * KW * KH; + const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE; + dim3 block_nums(num_blocks, OH, batch * IC); + im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); +} + +static void im2col_cuda_f16(const float * x, half * dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + + im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); +} + +static void im2col_cuda_f32(const float * x, float * dst, + int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, + int64_t batch, int64_t batch_offset, int64_t offset_delta, + int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { + + im2col_cuda(x, dst, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, offset_delta, s0, s1, p0, p1, d0, d1, stream); +} + +void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + + const int64_t IC = src1->ne[is_2D ? 2 : 1]; + const int64_t IH = is_2D ? src1->ne[1] : 1; + const int64_t IW = src1->ne[0]; + + const int64_t KH = is_2D ? src0->ne[1] : 1; + const int64_t KW = src0->ne[0]; + + const int64_t OH = is_2D ? dst->ne[2] : 1; + const int64_t OW = dst->ne[1]; + + const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32 + const int64_t batch = src1->ne[3]; + const size_t batch_offset = src1->nb[3] / 4; // nb is byte offset, src is type float32 + + if(dst->type == GGML_TYPE_F16) { + im2col_cuda_f16(src1_d, (half *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); + } else { + im2col_cuda_f32(src1_d, (float *) dst_d, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, stream); + } +} diff --git a/ggml-cuda/im2col.cuh b/ggml-cuda/im2col.cuh new file mode 100644 index 000000000..1ce8fae4d --- /dev/null +++ b/ggml-cuda/im2col.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_IM2COL_BLOCK_SIZE 256 + +void ggml_cuda_op_im2col(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/mmq.cu b/ggml-cuda/mmq.cu new file mode 100644 index 000000000..7948f1b12 --- /dev/null +++ b/ggml-cuda/mmq.cu @@ -0,0 +1,2255 @@ +#include "mmq.cuh" +#include "vecdotq.cuh" + +typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc); +typedef void (*load_tiles_cuda_t)( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row); +typedef float (*vec_dot_q_mul_mat_cuda_t)( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); +typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); + +template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); + GGML_UNUSED(x_sc); + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q4_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_0; + const int kqsx = k % QI4_0; + + const block_q4_0 * bx0 = (const block_q4_0 *) vx; + + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) { + int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const float * x_dmf = (const float *) x_dm; + + int u[2*VDR_Q4_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE]; + } + + return vec_dot_q4_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; + + *x_ql = tile_x_qs; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q4_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_1; + const int kqsx = k % QI4_1; + + const block_q4_1 * bx0 = (const block_q4_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) { + int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + + int u[2*VDR_Q4_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE]; + } + + return vec_dot_q4_1_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1], + y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; + + *x_ql = tile_x_ql; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q5_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_0; + const int kqsx = k % QI5_0; + + const block_q5_0 * bx0 = (const block_q5_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8(bxi->qs, kqsx); + const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + qs0 = __vsubss4(qs0, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + qs1 = __vsubss4(qs1, 0x10101010); // subtract 16 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_0; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) { + int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + int u[2*VDR_Q5_0_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE]; + } + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + + +template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; +} + +template static __device__ __forceinline__ void load_tiles_q5_1( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_1; + const int kqsx = k % QI5_1; + + const block_q5_1 * bx0 = (const block_q5_1 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1)); + + int qs0 = (ql >> 0) & 0x0F0F0F0F; + qs0 |= (qh << 4) & 0x00000010; // 0 -> 4 + qs0 |= (qh << 11) & 0x00001000; // 1 -> 12 + qs0 |= (qh << 18) & 0x00100000; // 2 -> 20 + qs0 |= (qh << 25) & 0x10000000; // 3 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0; + + int qs1 = (ql >> 4) & 0x0F0F0F0F; + qs1 |= (qh >> 12) & 0x00000010; // 16 -> 4 + qs1 |= (qh >> 5) & 0x00001000; // 17 -> 12 + qs1 |= (qh << 2) & 0x00100000; // 18 -> 20 + qs1 |= (qh << 9) & 0x10000000; // 19 -> 28 + + x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_1; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) { + int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm; + } +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); + const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; + + int u[2*VDR_Q5_1_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) { + u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l) % WARP_SIZE]; + u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE]; + } + + return vec_dot_q8_1_q8_1_impl + (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]); +} + +template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; + + *x_ql = tile_x_qs; + *x_dm = (half2 *) tile_x_d; +} + +template static __device__ __forceinline__ void load_tiles_q8_0( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI8_0; + const int kqsx = k % QI8_0; + float * x_dmf = (float *) x_dm; + + const block_q8_0 * bx0 = (const block_q8_0 *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI8_0; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) { + int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row; + + if (need_check) { + i = min(i, i_max); + } + + const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d; + } +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + return vec_dot_q8_0_q8_1_impl + (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0], + y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]); +} + +template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q2_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI2_K; + const int kqsx = k % QI2_K; + + const block_q2_K * bx0 = (const block_q2_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI2_K; + const int kbxd = k % blocks_per_tile_x_row; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) { + int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4)); + } +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); + + const int kbx = k / QI2_K; + const int ky = (k % QI2_K) * QR2_K; + const float * y_df = (const float *) y_ds; + + int v[QR2_K*VDR_Q2_K_Q8_1_MMQ]; + + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2); + const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2)); + +#pragma unroll + for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) { + v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303; + } + + const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4; + + const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE; + return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]); +} + +template static __device__ __forceinline__ void allocate_tiles_q3_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K]; + __shared__ int tile_x_qh[mmq_y * (WARP_SIZE/2) + mmq_y/2]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/4) + mmq_y/4]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_qh = tile_x_qh; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q3_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI3_K; + const int kqsx = k % QI3_K; + + const block_q3_K * bx0 = (const block_q3_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI3_K; + const int kbxd = k % blocks_per_tile_x_row; + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) { + int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) { + int i = i0 + i_offset * 2 + k / (WARP_SIZE/2); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2)); + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) { + int i = i0 + i_offset * 4 + k / (WARP_SIZE/4); + + if (need_check) { + i = min(i, i_max); + } + + const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4); + + const int ksc = k % (QI3_K/4); + + const int ksc_low = ksc % (QI3_K/8); + const int shift_low = 4 * (ksc / (QI3_K/8)); + const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F; + + const int ksc_high = QI3_K/8; + const int shift_high = 2 * ksc; + const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030; + + const int sc = __vsubss4(sc_low | sc_high, 0x20202020); + + x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc; + } +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + + const int kbx = k / QI3_K; + const int ky = (k % QI3_K) * QR3_K; + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4; + + int v[QR3_K*VDR_Q3_K_Q8_1_MMQ]; + +#pragma unroll + for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) { + const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2); + const int shift = 2 * ((ky % 32) / 8); + const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303; + + const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8); + const int vlh = (vh << 2) & 0x04040404; + + v[l] = __vsubss4(vll, vlh); + } + + const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE; + return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]); +} + +template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); + + __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q4_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI4_K; // == 0 if QK_K == 256 + const int kqsx = k % QI4_K; // == k if QK_K == 256 + + const block_q4_K * bx0 = (const block_q4_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx; + + x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) { + int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm; +#else + x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]}; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); + + const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE; + return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]); +} + +template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q5_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI5_K; // == 0 if QK_K == 256 + const int kqsx = k % QI5_K; // == k if QK_K == 256 + + const block_q5_K * bx0 = (const block_q5_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR5_K*kqsx; + + const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4)); + const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010; + const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010; + + const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0; + const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0; + x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1; + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) { + int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd; + +#if QK_K == 256 + x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm; +#endif + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8); + + const int * scales = (const int *) bxi->scales; + + const int ksc = k % (WARP_SIZE/8); + + // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8 + int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits + scales8 |= (scales[ksc/2] >> (2 * (ksc % 2))) & 0x30303030; // upper 2 bits + + x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8; + } +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); + + const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); + + const int index_x = i * (QR5_K*WARP_SIZE + 1) + QR5_K*k; + const int index_y = j * WARP_SIZE + (QR5_K*k) % WARP_SIZE; + return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8, + x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]); +} + +template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { + GGML_UNUSED(x_qh); + + __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; + __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; + __shared__ int tile_x_sc[mmq_y * (WARP_SIZE/8) + mmq_y/8]; + + *x_ql = tile_x_ql; + *x_dm = tile_x_dm; + *x_sc = tile_x_sc; +} + +template static __device__ __forceinline__ void load_tiles_q6_K( + const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, + int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { + GGML_UNUSED(x_qh); + + GGML_CUDA_ASSUME(i_offset >= 0); + GGML_CUDA_ASSUME(i_offset < nwarps); + GGML_CUDA_ASSUME(k >= 0); + GGML_CUDA_ASSUME(k < WARP_SIZE); + + const int kbx = k / QI6_K; // == 0 if QK_K == 256 + const int kqsx = k % QI6_K; // == k if QK_K == 256 + + const block_q6_K * bx0 = (const block_q6_K *) vx; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps) { + int i = i0 + i_offset; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx; + const int ky = QR6_K*kqsx; + + const int ql = get_int_from_uint8(bxi->ql, kqsx); + const int ql0 = (ql >> 0) & 0x0F0F0F0F; + const int ql1 = (ql >> 4) & 0x0F0F0F0F; + + const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4)); + const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030; + const int qh1 = (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) & 0x30303030; + + const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0; + const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2); + + x_ql[i * (2*WARP_SIZE + 1) + kq0] = __vsubss4(ql0 | qh0, 0x20202020); + x_ql[i * (2*WARP_SIZE + 1) + kq1] = __vsubss4(ql1 | qh1, 0x20202020); + } + + const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256 + const int kbxd = k % blocks_per_tile_x_row; // == 0 if QK_K == 256 + float * x_dmf = (float *) x_dm; + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) { + int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd; + + x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d; + } + +#pragma unroll + for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) { + int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y; + + if (need_check) { + i = min(i, i_max); + } + + const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4; + + x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8)); + } +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( + const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, + const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { + GGML_UNUSED(x_qh); + + const float * x_dmf = (const float *) x_dm; + const float * y_df = (const float *) y_ds; + + const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]); + + const int index_x = i * (QR6_K*WARP_SIZE + 1) + QR6_K*k; + const int index_y = j * WARP_SIZE + (QR6_K*k) % WARP_SIZE; + return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]); +} + +#define MMQ_X_Q4_0_RDNA2 64 +#define MMQ_Y_Q4_0_RDNA2 128 +#define NWARPS_Q4_0_RDNA2 8 +#define MMQ_X_Q4_0_RDNA1 64 +#define MMQ_Y_Q4_0_RDNA1 64 +#define NWARPS_Q4_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_0_AMPERE 4 +#define MMQ_Y_Q4_0_AMPERE 32 +#define NWARPS_Q4_0_AMPERE 4 +#else +#define MMQ_X_Q4_0_AMPERE 64 +#define MMQ_Y_Q4_0_AMPERE 128 +#define NWARPS_Q4_0_AMPERE 4 +#endif +#define MMQ_X_Q4_0_PASCAL 64 +#define MMQ_Y_Q4_0_PASCAL 64 +#define NWARPS_Q4_0_PASCAL 8 + +template +static __device__ __forceinline__ void mul_mat_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + const int blocks_per_warp = WARP_SIZE / qi; + + const int & ncols_dst = ncols_y; + + const int row_dst_0 = blockIdx.x*mmq_y; + const int & row_x_0 = row_dst_0; + + const int col_dst_0 = blockIdx.y*mmq_x; + const int & col_y_0 = col_dst_0; + + int * tile_x_ql = nullptr; + half2 * tile_x_dm = nullptr; + int * tile_x_qh = nullptr; + int * tile_x_sc = nullptr; + + allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc); + + __shared__ int tile_y_qs[mmq_x * WARP_SIZE]; + __shared__ half2 tile_y_ds[mmq_x * WARP_SIZE/QI8_1]; + + float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}}; + + for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { + + load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); + +#pragma unroll + for (int ir = 0; ir < qr; ++ir) { + const int kqs = ir*WARP_SIZE + threadIdx.x; + const int kbxd = kqs / QI8_1; + +#pragma unroll + for (int i = 0; i < mmq_x; i += nwarps) { + const int col_y_eff = min(col_y_0 + threadIdx.y + i, ncols_y-1); // to prevent out-of-bounds memory accesses + + const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd]; + + const int index_y = (threadIdx.y + i) * WARP_SIZE + kqs % WARP_SIZE; + tile_y_qs[index_y] = get_int_from_int8_aligned(by0->qs, threadIdx.x % QI8_1); + } + +#pragma unroll + for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { + const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE/QI8_1)) % mmq_x; + const int kby = threadIdx.x % (WARP_SIZE/QI8_1); + const int col_y_eff = min(col_y_0 + ids, ncols_y-1); + + // if the sum is not needed it's faster to transform the scale to f32 ahead of time + const half2 * dsi_src = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + ir*(WARP_SIZE/QI8_1) + kby].ds; + half2 * dsi_dst = &tile_y_ds[ids * (WARP_SIZE/QI8_1) + kby]; + if (need_sum) { + *dsi_dst = *dsi_src; + } else { + float * dfi_dst = (float *) dsi_dst; + *dfi_dst = __low2float(*dsi_src); + } + } + + __syncthreads(); + +// #pragma unroll // unrolling this loop causes too much register pressure + for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) { +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + sum[i/WARP_SIZE][j/nwarps] += vec_dot( + tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, tile_y_qs, tile_y_ds, + threadIdx.x + i, threadIdx.y + j, k); + } + } + } + + __syncthreads(); + } + } + +#pragma unroll + for (int j = 0; j < mmq_x; j += nwarps) { + const int col_dst = col_dst_0 + j + threadIdx.y; + + if (col_dst >= ncols_dst) { + return; + } + +#pragma unroll + for (int i = 0; i < mmq_y; i += WARP_SIZE) { + const int row_dst = row_dst_0 + threadIdx.x + i; + + if (row_dst >= nrows_dst) { + continue; + } + + dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + } + } +} + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q4_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_0_RDNA2; + const int mmq_y = MMQ_Y_Q4_0_RDNA2; + const int nwarps = NWARPS_Q4_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_0_RDNA1; + const int mmq_y = MMQ_Y_Q4_0_RDNA1; + const int nwarps = NWARPS_Q4_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_0_AMPERE; + const int mmq_y = MMQ_Y_Q4_0_AMPERE; + const int nwarps = NWARPS_Q4_0_AMPERE; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_0_PASCAL; + const int mmq_y = MMQ_Y_Q4_0_PASCAL; + const int nwarps = NWARPS_Q4_0_PASCAL; + + mul_mat_q, + load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_1_RDNA2 64 +#define MMQ_Y_Q4_1_RDNA2 128 +#define NWARPS_Q4_1_RDNA2 8 +#define MMQ_X_Q4_1_RDNA1 64 +#define MMQ_Y_Q4_1_RDNA1 64 +#define NWARPS_Q4_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_1_AMPERE 4 +#define MMQ_Y_Q4_1_AMPERE 32 +#define NWARPS_Q4_1_AMPERE 4 +#else +#define MMQ_X_Q4_1_AMPERE 64 +#define MMQ_Y_Q4_1_AMPERE 128 +#define NWARPS_Q4_1_AMPERE 4 +#endif +#define MMQ_X_Q4_1_PASCAL 64 +#define MMQ_Y_Q4_1_PASCAL 64 +#define NWARPS_Q4_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_1_RDNA2; + const int mmq_y = MMQ_Y_Q4_1_RDNA2; + const int nwarps = NWARPS_Q4_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_1_RDNA1; + const int mmq_y = MMQ_Y_Q4_1_RDNA1; + const int nwarps = NWARPS_Q4_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_1_AMPERE; + const int mmq_y = MMQ_Y_Q4_1_AMPERE; + const int nwarps = NWARPS_Q4_1_AMPERE; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_1_PASCAL; + const int mmq_y = MMQ_Y_Q4_1_PASCAL; + const int nwarps = NWARPS_Q4_1_PASCAL; + + mul_mat_q, + load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_0_RDNA2 64 +#define MMQ_Y_Q5_0_RDNA2 128 +#define NWARPS_Q5_0_RDNA2 8 +#define MMQ_X_Q5_0_RDNA1 64 +#define MMQ_Y_Q5_0_RDNA1 64 +#define NWARPS_Q5_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_0_AMPERE 4 +#define MMQ_Y_Q5_0_AMPERE 32 +#define NWARPS_Q5_0_AMPERE 4 +#else +#define MMQ_X_Q5_0_AMPERE 128 +#define MMQ_Y_Q5_0_AMPERE 64 +#define NWARPS_Q5_0_AMPERE 4 +#endif +#define MMQ_X_Q5_0_PASCAL 64 +#define MMQ_Y_Q5_0_PASCAL 64 +#define NWARPS_Q5_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q5_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_0_RDNA2; + const int mmq_y = MMQ_Y_Q5_0_RDNA2; + const int nwarps = NWARPS_Q5_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_0_RDNA1; + const int mmq_y = MMQ_Y_Q5_0_RDNA1; + const int nwarps = NWARPS_Q5_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_0_AMPERE; + const int mmq_y = MMQ_Y_Q5_0_AMPERE; + const int nwarps = NWARPS_Q5_0_AMPERE; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_0_PASCAL; + const int mmq_y = MMQ_Y_Q5_0_PASCAL; + const int nwarps = NWARPS_Q5_0_PASCAL; + + mul_mat_q, + load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_1_RDNA2 64 +#define MMQ_Y_Q5_1_RDNA2 128 +#define NWARPS_Q5_1_RDNA2 8 +#define MMQ_X_Q5_1_RDNA1 64 +#define MMQ_Y_Q5_1_RDNA1 64 +#define NWARPS_Q5_1_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_1_AMPERE 4 +#define MMQ_Y_Q5_1_AMPERE 32 +#define NWARPS_Q5_1_AMPERE 4 +#else +#define MMQ_X_Q5_1_AMPERE 128 +#define MMQ_Y_Q5_1_AMPERE 64 +#define NWARPS_Q5_1_AMPERE 4 +#endif +#define MMQ_X_Q5_1_PASCAL 64 +#define MMQ_Y_Q5_1_PASCAL 64 +#define NWARPS_Q5_1_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_1( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_1_RDNA2; + const int mmq_y = MMQ_Y_Q5_1_RDNA2; + const int nwarps = NWARPS_Q5_1_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_1_RDNA1; + const int mmq_y = MMQ_Y_Q5_1_RDNA1; + const int nwarps = NWARPS_Q5_1_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_1_AMPERE; + const int mmq_y = MMQ_Y_Q5_1_AMPERE; + const int nwarps = NWARPS_Q5_1_AMPERE; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_1_PASCAL; + const int mmq_y = MMQ_Y_Q5_1_PASCAL; + const int nwarps = NWARPS_Q5_1_PASCAL; + + mul_mat_q, + load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q8_0_RDNA2 64 +#define MMQ_Y_Q8_0_RDNA2 128 +#define NWARPS_Q8_0_RDNA2 8 +#define MMQ_X_Q8_0_RDNA1 64 +#define MMQ_Y_Q8_0_RDNA1 64 +#define NWARPS_Q8_0_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q8_0_AMPERE 4 +#define MMQ_Y_Q8_0_AMPERE 32 +#define NWARPS_Q8_0_AMPERE 4 +#else +#define MMQ_X_Q8_0_AMPERE 128 +#define MMQ_Y_Q8_0_AMPERE 64 +#define NWARPS_Q8_0_AMPERE 4 +#endif +#define MMQ_X_Q8_0_PASCAL 64 +#define MMQ_Y_Q8_0_PASCAL 64 +#define NWARPS_Q8_0_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + mul_mat_q8_0( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q8_0_RDNA2; + const int mmq_y = MMQ_Y_Q8_0_RDNA2; + const int nwarps = NWARPS_Q8_0_RDNA2; +#else + const int mmq_x = MMQ_X_Q8_0_RDNA1; + const int mmq_y = MMQ_Y_Q8_0_RDNA1; + const int nwarps = NWARPS_Q8_0_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q8_0_AMPERE; + const int mmq_y = MMQ_Y_Q8_0_AMPERE; + const int nwarps = NWARPS_Q8_0_AMPERE; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q8_0_PASCAL; + const int mmq_y = MMQ_Y_Q8_0_PASCAL; + const int nwarps = NWARPS_Q8_0_PASCAL; + + mul_mat_q, + load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q2_K_RDNA2 64 +#define MMQ_Y_Q2_K_RDNA2 128 +#define NWARPS_Q2_K_RDNA2 8 +#define MMQ_X_Q2_K_RDNA1 128 +#define MMQ_Y_Q2_K_RDNA1 32 +#define NWARPS_Q2_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q2_K_AMPERE 4 +#define MMQ_Y_Q2_K_AMPERE 32 +#define NWARPS_Q2_K_AMPERE 4 +#else +#define MMQ_X_Q2_K_AMPERE 64 +#define MMQ_Y_Q2_K_AMPERE 128 +#define NWARPS_Q2_K_AMPERE 4 +#endif +#define MMQ_X_Q2_K_PASCAL 64 +#define MMQ_Y_Q2_K_PASCAL 64 +#define NWARPS_Q2_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q2_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q2_K_RDNA2; + const int mmq_y = MMQ_Y_Q2_K_RDNA2; + const int nwarps = NWARPS_Q2_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q2_K_RDNA1; + const int mmq_y = MMQ_Y_Q2_K_RDNA1; + const int nwarps = NWARPS_Q2_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q2_K_AMPERE; + const int mmq_y = MMQ_Y_Q2_K_AMPERE; + const int nwarps = NWARPS_Q2_K_AMPERE; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q2_K_PASCAL; + const int mmq_y = MMQ_Y_Q2_K_PASCAL; + const int nwarps = NWARPS_Q2_K_PASCAL; + + mul_mat_q, + load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q3_K_RDNA2 128 +#define MMQ_Y_Q3_K_RDNA2 64 +#define NWARPS_Q3_K_RDNA2 8 +#define MMQ_X_Q3_K_RDNA1 32 +#define MMQ_Y_Q3_K_RDNA1 128 +#define NWARPS_Q3_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q3_K_AMPERE 4 +#define MMQ_Y_Q3_K_AMPERE 32 +#define NWARPS_Q3_K_AMPERE 4 +#else +#define MMQ_X_Q3_K_AMPERE 128 +#define MMQ_Y_Q3_K_AMPERE 128 +#define NWARPS_Q3_K_AMPERE 4 +#endif +#define MMQ_X_Q3_K_PASCAL 64 +#define MMQ_Y_Q3_K_PASCAL 64 +#define NWARPS_Q3_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q3_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q3_K_RDNA2; + const int mmq_y = MMQ_Y_Q3_K_RDNA2; + const int nwarps = NWARPS_Q3_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q3_K_RDNA1; + const int mmq_y = MMQ_Y_Q3_K_RDNA1; + const int nwarps = NWARPS_Q3_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q3_K_AMPERE; + const int mmq_y = MMQ_Y_Q3_K_AMPERE; + const int nwarps = NWARPS_Q3_K_AMPERE; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q3_K_PASCAL; + const int mmq_y = MMQ_Y_Q3_K_PASCAL; + const int nwarps = NWARPS_Q3_K_PASCAL; + + mul_mat_q, + load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q4_K_RDNA2 64 +#define MMQ_Y_Q4_K_RDNA2 128 +#define NWARPS_Q4_K_RDNA2 8 +#define MMQ_X_Q4_K_RDNA1 32 +#define MMQ_Y_Q4_K_RDNA1 64 +#define NWARPS_Q4_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q4_K_AMPERE 4 +#define MMQ_Y_Q4_K_AMPERE 32 +#define NWARPS_Q4_K_AMPERE 4 +#else +#define MMQ_X_Q4_K_AMPERE 64 +#define MMQ_Y_Q4_K_AMPERE 128 +#define NWARPS_Q4_K_AMPERE 4 +#endif +#define MMQ_X_Q4_K_PASCAL 64 +#define MMQ_Y_Q4_K_PASCAL 64 +#define NWARPS_Q4_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q4_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q4_K_RDNA2; + const int mmq_y = MMQ_Y_Q4_K_RDNA2; + const int nwarps = NWARPS_Q4_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q4_K_RDNA1; + const int mmq_y = MMQ_Y_Q4_K_RDNA1; + const int nwarps = NWARPS_Q4_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q4_K_AMPERE; + const int mmq_y = MMQ_Y_Q4_K_AMPERE; + const int nwarps = NWARPS_Q4_K_AMPERE; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q4_K_PASCAL; + const int mmq_y = MMQ_Y_Q4_K_PASCAL; + const int nwarps = NWARPS_Q4_K_PASCAL; + + mul_mat_q, + load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q5_K_RDNA2 64 +#define MMQ_Y_Q5_K_RDNA2 128 +#define NWARPS_Q5_K_RDNA2 8 +#define MMQ_X_Q5_K_RDNA1 32 +#define MMQ_Y_Q5_K_RDNA1 64 +#define NWARPS_Q5_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q5_K_AMPERE 4 +#define MMQ_Y_Q5_K_AMPERE 32 +#define NWARPS_Q5_K_AMPERE 4 +#else +#define MMQ_X_Q5_K_AMPERE 64 +#define MMQ_Y_Q5_K_AMPERE 128 +#define NWARPS_Q5_K_AMPERE 4 +#endif +#define MMQ_X_Q5_K_PASCAL 64 +#define MMQ_Y_Q5_K_PASCAL 64 +#define NWARPS_Q5_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +mul_mat_q5_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q5_K_RDNA2; + const int mmq_y = MMQ_Y_Q5_K_RDNA2; + const int nwarps = NWARPS_Q5_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q5_K_RDNA1; + const int mmq_y = MMQ_Y_Q5_K_RDNA1; + const int nwarps = NWARPS_Q5_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q5_K_AMPERE; + const int mmq_y = MMQ_Y_Q5_K_AMPERE; + const int nwarps = NWARPS_Q5_K_AMPERE; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q5_K_PASCAL; + const int mmq_y = MMQ_Y_Q5_K_PASCAL; + const int nwarps = NWARPS_Q5_K_PASCAL; + + mul_mat_q, + load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +#define MMQ_X_Q6_K_RDNA2 64 +#define MMQ_Y_Q6_K_RDNA2 128 +#define NWARPS_Q6_K_RDNA2 8 +#define MMQ_X_Q6_K_RDNA1 32 +#define MMQ_Y_Q6_K_RDNA1 64 +#define NWARPS_Q6_K_RDNA1 8 +#if defined(CUDA_USE_TENSOR_CORES) +#define MMQ_X_Q6_K_AMPERE 4 +#define MMQ_Y_Q6_K_AMPERE 32 +#define NWARPS_Q6_K_AMPERE 4 +#else +#define MMQ_X_Q6_K_AMPERE 64 +#define MMQ_Y_Q6_K_AMPERE 64 +#define NWARPS_Q6_K_AMPERE 4 +#endif +#define MMQ_X_Q6_K_PASCAL 64 +#define MMQ_Y_Q6_K_PASCAL 64 +#define NWARPS_Q6_K_PASCAL 8 + +template static __global__ void +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2) +#endif // defined(RDNA3) || defined(RDNA2) +#elif __CUDA_ARCH__ < CC_VOLTA + __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2) +#endif // __CUDA_ARCH__ < CC_VOLTA + mul_mat_q6_K( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#if defined(RDNA3) || defined(RDNA2) + const int mmq_x = MMQ_X_Q6_K_RDNA2; + const int mmq_y = MMQ_Y_Q6_K_RDNA2; + const int nwarps = NWARPS_Q6_K_RDNA2; +#else + const int mmq_x = MMQ_X_Q6_K_RDNA1; + const int mmq_y = MMQ_Y_Q6_K_RDNA1; + const int nwarps = NWARPS_Q6_K_RDNA1; +#endif // defined(RDNA3) || defined(RDNA2) + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= CC_VOLTA + const int mmq_x = MMQ_X_Q6_K_AMPERE; + const int mmq_y = MMQ_Y_Q6_K_AMPERE; + const int nwarps = NWARPS_Q6_K_AMPERE; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + +#elif __CUDA_ARCH__ >= MIN_CC_DP4A + const int mmq_x = MMQ_X_Q6_K_PASCAL; + const int mmq_y = MMQ_Y_Q6_K_PASCAL; + const int nwarps = NWARPS_Q6_K_PASCAL; + + mul_mat_q, + load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); +#else + GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= CC_VOLTA +} + +static void ggml_mul_mat_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_0_RDNA2; + mmq_y = MMQ_Y_Q4_0_RDNA2; + nwarps = NWARPS_Q4_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_0_RDNA1; + mmq_y = MMQ_Y_Q4_0_RDNA1; + nwarps = NWARPS_Q4_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_0_AMPERE; + mmq_y = MMQ_Y_Q4_0_AMPERE; + nwarps = NWARPS_Q4_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_0_PASCAL; + mmq_y = MMQ_Y_Q4_0_PASCAL; + nwarps = NWARPS_Q4_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_1_RDNA2; + mmq_y = MMQ_Y_Q4_1_RDNA2; + nwarps = NWARPS_Q4_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_1_RDNA1; + mmq_y = MMQ_Y_Q4_1_RDNA1; + nwarps = NWARPS_Q4_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_1_AMPERE; + mmq_y = MMQ_Y_Q4_1_AMPERE; + nwarps = NWARPS_Q4_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_1_PASCAL; + mmq_y = MMQ_Y_Q4_1_PASCAL; + nwarps = NWARPS_Q4_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_0_RDNA2; + mmq_y = MMQ_Y_Q5_0_RDNA2; + nwarps = NWARPS_Q5_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_0_RDNA1; + mmq_y = MMQ_Y_Q5_0_RDNA1; + nwarps = NWARPS_Q5_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_0_AMPERE; + mmq_y = MMQ_Y_Q5_0_AMPERE; + nwarps = NWARPS_Q5_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_0_PASCAL; + mmq_y = MMQ_Y_Q5_0_PASCAL; + nwarps = NWARPS_Q5_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_1_RDNA2; + mmq_y = MMQ_Y_Q5_1_RDNA2; + nwarps = NWARPS_Q5_1_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_1_RDNA1; + mmq_y = MMQ_Y_Q5_1_RDNA1; + nwarps = NWARPS_Q5_1_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_1_AMPERE; + mmq_y = MMQ_Y_Q5_1_AMPERE; + nwarps = NWARPS_Q5_1_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_1_PASCAL; + mmq_y = MMQ_Y_Q5_1_PASCAL; + nwarps = NWARPS_Q5_1_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_1<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q8_0_RDNA2; + mmq_y = MMQ_Y_Q8_0_RDNA2; + nwarps = NWARPS_Q8_0_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q8_0_RDNA1; + mmq_y = MMQ_Y_Q8_0_RDNA1; + nwarps = NWARPS_Q8_0_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q8_0_AMPERE; + mmq_y = MMQ_Y_Q8_0_AMPERE; + nwarps = NWARPS_Q8_0_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q8_0_PASCAL; + mmq_y = MMQ_Y_Q8_0_PASCAL; + nwarps = NWARPS_Q8_0_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q8_0<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q2_K_RDNA2; + mmq_y = MMQ_Y_Q2_K_RDNA2; + nwarps = NWARPS_Q2_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q2_K_RDNA1; + mmq_y = MMQ_Y_Q2_K_RDNA1; + nwarps = NWARPS_Q2_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q2_K_AMPERE; + mmq_y = MMQ_Y_Q2_K_AMPERE; + nwarps = NWARPS_Q2_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q2_K_PASCAL; + mmq_y = MMQ_Y_Q2_K_PASCAL; + nwarps = NWARPS_Q2_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q2_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + +#if QK_K == 256 + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q3_K_RDNA2; + mmq_y = MMQ_Y_Q3_K_RDNA2; + nwarps = NWARPS_Q3_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q3_K_RDNA1; + mmq_y = MMQ_Y_Q3_K_RDNA1; + nwarps = NWARPS_Q3_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q3_K_AMPERE; + mmq_y = MMQ_Y_Q3_K_AMPERE; + nwarps = NWARPS_Q3_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q3_K_PASCAL; + mmq_y = MMQ_Y_Q3_K_PASCAL; + nwarps = NWARPS_Q3_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q3_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +#endif +} + +static void ggml_mul_mat_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q4_K_RDNA2; + mmq_y = MMQ_Y_Q4_K_RDNA2; + nwarps = NWARPS_Q4_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q4_K_RDNA1; + mmq_y = MMQ_Y_Q4_K_RDNA1; + nwarps = NWARPS_Q4_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q4_K_AMPERE; + mmq_y = MMQ_Y_Q4_K_AMPERE; + nwarps = NWARPS_Q4_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q4_K_PASCAL; + mmq_y = MMQ_Y_Q4_K_PASCAL; + nwarps = NWARPS_Q4_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q4_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q5_K_RDNA2; + mmq_y = MMQ_Y_Q5_K_RDNA2; + nwarps = NWARPS_Q5_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q5_K_RDNA1; + mmq_y = MMQ_Y_Q5_K_RDNA1; + nwarps = NWARPS_Q5_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q5_K_AMPERE; + mmq_y = MMQ_Y_Q5_K_AMPERE; + nwarps = NWARPS_Q5_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q5_K_PASCAL; + mmq_y = MMQ_Y_Q5_K_PASCAL; + nwarps = NWARPS_Q5_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q5_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +static void ggml_mul_mat_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, + const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + + int id = ggml_cuda_get_device(); + const int compute_capability = ggml_cuda_info().devices[id].cc; + + int mmq_x, mmq_y, nwarps; + if (compute_capability >= CC_RDNA2) { + mmq_x = MMQ_X_Q6_K_RDNA2; + mmq_y = MMQ_Y_Q6_K_RDNA2; + nwarps = NWARPS_Q6_K_RDNA2; + } else if (compute_capability >= CC_OFFSET_AMD) { + mmq_x = MMQ_X_Q6_K_RDNA1; + mmq_y = MMQ_Y_Q6_K_RDNA1; + nwarps = NWARPS_Q6_K_RDNA1; + } else if (compute_capability >= CC_VOLTA) { + mmq_x = MMQ_X_Q6_K_AMPERE; + mmq_y = MMQ_Y_Q6_K_AMPERE; + nwarps = NWARPS_Q6_K_AMPERE; + } else if (compute_capability >= MIN_CC_DP4A) { + mmq_x = MMQ_X_Q6_K_PASCAL; + mmq_y = MMQ_Y_Q6_K_PASCAL; + nwarps = NWARPS_Q6_K_PASCAL; + } else { + GGML_ASSERT(false); + } + + const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; + const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; + const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + if (nrows_x % mmq_y == 0) { + const bool need_check = false; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } else { + const bool need_check = true; + mul_mat_q6_K<<>> + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + } +} + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + const int64_t row_diff = row_high - row_low; + + int id = ggml_cuda_get_device(); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddf_i); +} + +bool ggml_cuda_supports_mmq(enum ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + return true; + default: + return false; + } +} diff --git a/ggml-cuda/mmq.cuh b/ggml-cuda/mmq.cuh new file mode 100644 index 000000000..807817c4a --- /dev/null +++ b/ggml-cuda/mmq.cuh @@ -0,0 +1,9 @@ +#include "common.cuh" + +void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); + +bool ggml_cuda_supports_mmq(enum ggml_type type); diff --git a/ggml-cuda/mmvq.cu b/ggml-cuda/mmvq.cu new file mode 100644 index 000000000..65cc1bcaa --- /dev/null +++ b/ggml-cuda/mmvq.cu @@ -0,0 +1,404 @@ +#include "mmvq.cuh" +#include "vecdotq.cuh" + +typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs); + +template +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +// tell the compiler to use as many registers as it wants, see nwarps definition below +__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1) +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +static __global__ void mul_mat_vec_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3)) + constexpr int nwarps = 1; + constexpr int rows_per_cuda_block = 1; +#else + constexpr int nwarps = ncols_y <= 4 ? 4 : 2; + constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3) + + const int tid = WARP_SIZE*threadIdx.y + threadIdx.x; + const int row0 = rows_per_cuda_block*blockIdx.x; + const int blocks_per_row_x = ncols_x / qk; + const int blocks_per_col_y = nrows_y / QK8_1; + constexpr int blocks_per_iter = vdr * nwarps*WARP_SIZE / qi; + +// partial sum for each thread + float tmp[ncols_y][rows_per_cuda_block] = {0.0f}; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { + const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx + + // x block quant index when casting the quants to int + const int kqs = vdr * (tid % (qi/vdr)); + +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp[j][i] += vec_dot_q_cuda( + &x[kbx + (row0 + i)*blocks_per_row_x], &y[j*blocks_per_col_y + kby], kqs); + } + } + } + + __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][WARP_SIZE]; + if (threadIdx.y > 0) { +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { + tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; + } + } + } + __syncthreads(); + if (threadIdx.y > 0) { + return; + } + + // sum up partial sums and write back result +#pragma unroll + for (int j = 0; j < ncols_y; ++j) { +#pragma unroll + for (int i = 0; i < rows_per_cuda_block; ++i) { +#pragma unroll + for (int l = 0; l < nwarps-1; ++l) { + tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; + } + tmp[j][i] = warp_reduce_sum(tmp[j][i]); + } + + if (threadIdx.x < rows_per_cuda_block) { + dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + } + } +} + +template +static void mul_mat_vec_q_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + GGML_ASSERT(ncols_x % qk == 0); + GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); + + int id = ggml_cuda_get_device(); + + int64_t nwarps = 1; + int64_t rows_per_cuda_block = 1; + + if (ggml_cuda_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 + switch(ncols_y) { + case 1: + nwarps = 4; + rows_per_cuda_block = 1; + break; + case 2: + case 3: + case 4: + nwarps = 4; + rows_per_cuda_block = 2; + break; + case 5: + case 6: + case 7: + case 8: + nwarps = 2; + rows_per_cuda_block = 2; + break; + default: + GGML_ASSERT(false); + break; + } + } + const int64_t nblocks = (nrows_x + rows_per_cuda_block - 1) / rows_per_cuda_block; + const dim3 block_nums(nblocks, 1, 1); + const dim3 block_dims(WARP_SIZE, nwarps, 1); + + switch (ncols_y) { + case 1: + mul_mat_vec_q<1, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 2: + mul_mat_vec_q<2, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 3: + mul_mat_vec_q<3, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 4: + mul_mat_vec_q<4, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 5: + mul_mat_vec_q<5, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 6: + mul_mat_vec_q<6, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 7: + mul_mat_vec_q<7, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + case 8: + mul_mat_vec_q<8, qk, qi, block_q_t, vdr, vec_dot> + <<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + break; + default: + GGML_ASSERT(false); + break; + } +} + +static void mul_mat_vec_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq2_xxs_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq2_xs_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq2_s_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq3_xxs_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq1_s_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq1_m_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq4_nl_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq4_xs_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +static void mul_mat_vec_iq3_s_q8_1_cuda( + const void * vx, const void * vy, float * dst, + const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { + + mul_mat_vec_q_cuda + (vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); +} + +void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + int id = ggml_cuda_get_device(); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; + + switch (src0->type) { + case GGML_TYPE_Q4_0: + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q4_1: + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q5_0: + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q5_1: + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q8_0: + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q2_K: + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q3_K: + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q4_K: + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q5_K: + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_Q6_K: + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ2_XXS: + mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ3_XXS: + mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + case GGML_TYPE_IQ3_S: + mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + break; + default: + GGML_ASSERT(false); + break; + } + + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_ncols); + GGML_UNUSED(src1_padded_row_size); +} diff --git a/ggml-cuda/mmvq.cuh b/ggml-cuda/mmvq.cuh new file mode 100644 index 000000000..88c42c4b7 --- /dev/null +++ b/ggml-cuda/mmvq.cuh @@ -0,0 +1,7 @@ +#include "common.cuh" + +void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream); diff --git a/ggml-cuda/norm.cu b/ggml-cuda/norm.cu new file mode 100644 index 000000000..86f774534 --- /dev/null +++ b/ggml-cuda/norm.cu @@ -0,0 +1,215 @@ +#include "norm.cuh" + +template +static __global__ void norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float2 mean_var = make_float2(0.f, 0.f); + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + mean_var.x += xi; + mean_var.y += xi * xi; + } + + // sum up partial sums + mean_var = warp_reduce_sum(mean_var); + if (block_size > WARP_SIZE) { + __shared__ float2 s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = mean_var; + } + __syncthreads(); + mean_var = s_sum[lane_id]; + mean_var = warp_reduce_sum(mean_var); + } + + const float mean = mean_var.x / ncols; + const float var = mean_var.y / ncols - mean * mean; + const float inv_std = rsqrtf(var + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std; + } +} + +template +static __global__ void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps) { + // blockIdx.x: num_groups idx + // threadIdx.x: block_size idx + int start = blockIdx.x * group_size; + int end = start + group_size; + + start += threadIdx.x; + + if (end >= ne_elements) { + end = ne_elements; + } + + float tmp = 0.0f; // partial sum for thread in warp + + for (int j = start; j < end; j += block_size) { + tmp += x[j]; + } + + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + float mean = tmp / group_size; + tmp = 0.0f; + + for (int j = start; j < end; j += block_size) { + float xi = x[j] - mean; + dst[j] = xi; + tmp += xi * xi; + } + + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + float variance = tmp / group_size; + float scale = rsqrtf(variance + eps); + for (int j = start; j < end; j += block_size) { + dst[j] *= scale; + } +} + +template +static __global__ void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps) { + const int row = blockIdx.x*blockDim.y + threadIdx.y; + const int tid = threadIdx.x; + + float tmp = 0.0f; // partial sum for thread in warp + + for (int col = tid; col < ncols; col += block_size) { + const float xi = x[row*ncols + col]; + tmp += xi * xi; + } + + // sum up partial sums + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __shared__ float s_sum[32]; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + if (lane_id == 0) { + s_sum[warp_id] = tmp; + } + __syncthreads(); + tmp = s_sum[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float mean = tmp / ncols; + const float scale = rsqrtf(mean + eps); + + for (int col = tid; col < ncols; col += block_size) { + dst[row*ncols + col] = scale * x[row*ncols + col]; + } +} + +static void norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + norm_f32<<>>(x, dst, ncols, eps); + } else { + const dim3 block_dims(1024, 1, 1); + norm_f32<1024><<>>(x, dst, ncols, eps); + } +} + +static void group_norm_f32_cuda(const float * x, float * dst, const int num_groups, const int group_size, const int ne_elements, cudaStream_t stream) { + static const float eps = 1e-6f; + if (group_size < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + group_norm_f32<<>>(x, dst, group_size, ne_elements, eps); + } else { + const dim3 block_dims(1024, 1, 1); + group_norm_f32<1024><<>>(x, dst, group_size, ne_elements, eps); + } +} + +static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, const float eps, cudaStream_t stream) { + GGML_ASSERT(ncols % WARP_SIZE == 0); + if (ncols < 1024) { + const dim3 block_dims(WARP_SIZE, 1, 1); + rms_norm_f32<<>>(x, dst, ncols, eps); + } else { + const dim3 block_dims(1024, 1, 1); + rms_norm_f32<1024><<>>(x, dst, ncols, eps); + } +} + +void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream); +} + +void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int num_groups = dst->op_params[0]; + int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); + group_norm_f32_cuda(src0_d, dst_d, num_groups * src0->ne[3], group_size, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + rms_norm_f32_cuda(src0_d, dst_d, ne00, nrows, eps, stream); +} diff --git a/ggml-cuda/norm.cuh b/ggml-cuda/norm.cuh new file mode 100644 index 000000000..431a8f74d --- /dev/null +++ b/ggml-cuda/norm.cuh @@ -0,0 +1,7 @@ +#include "common.cuh" + +void ggml_cuda_op_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_group_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_rms_norm(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/pad.cu b/ggml-cuda/pad.cu new file mode 100644 index 000000000..aba539e8d --- /dev/null +++ b/ggml-cuda/pad.cu @@ -0,0 +1,49 @@ +#include "pad.cuh" + +static __global__ void pad_f32(const float * x, float * dst, const int ne0, const int ne00, const int ne01, const int ne02, const int ne03) { + // blockIdx.z: idx of ne2*ne3, aka ne02*ne03 + // blockIdx.y: idx of ne1 + // blockIDx.x: idx of ne0 / BLOCK_SIZE + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + + // operation + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + if (nidx < ne00 && blockIdx.y < ne01 && blockIdx.z < ne02*ne03) { + int offset_src = + nidx + + blockIdx.y * ne00 + + blockIdx.z * ne00 * ne01; + dst[offset_dst] = x[offset_src]; + } else { + dst[offset_dst] = 0.0f; + } +} + +static void pad_f32_cuda(const float * x, float * dst, + const int ne00, const int ne01, const int ne02, const int ne03, + const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) { + int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne1, ne2*ne3); + pad_f32<<>>(x, dst, ne0, ne00, ne01, ne02, ne03); +} + +void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + pad_f32_cuda(src0_d, dst_d, + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], stream); +} diff --git a/ggml-cuda/pad.cuh b/ggml-cuda/pad.cuh new file mode 100644 index 000000000..8fd386b00 --- /dev/null +++ b/ggml-cuda/pad.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_PAD_BLOCK_SIZE 256 + +void ggml_cuda_op_pad(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/pool2d.cu b/ggml-cuda/pool2d.cu new file mode 100644 index 000000000..c6d51e4d6 --- /dev/null +++ b/ggml-cuda/pool2d.cu @@ -0,0 +1,94 @@ +#include "pool2d.cuh" + +template +static __global__ void pool2d_nchw_kernel( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const Ti* src, To* dst, const enum ggml_op_pool op) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = max(0, start_h); + const int eh = min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = max(0, start_w); + const int ew = min(iw, start_w + kw); + const To scale = 1. / (kh * kw); + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: assert(false); + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { +#if __CUDA_ARCH__ >= 350 + Ti cur = __ldg(i_ptr + i * iw + j); +#else + Ti cur = i_ptr[i * iw + j]; +#endif + switch (op) { + case GGML_OP_POOL_AVG: res += cur * scale; break; + case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; + default: assert(false); + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + +static void pool2d_nchw_kernel_f32_f32_cuda( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const float * src, float * dst, const enum ggml_op_pool op, + cudaStream_t stream) { + + const int num_blocks = (parallel_elements + CUDA_POOL2D_BLOCK_SIZE - 1) / CUDA_POOL2D_BLOCK_SIZE; + dim3 block_nums(num_blocks); + pool2d_nchw_kernel<<>>(ih, iw, oh, ow, kh, kw, sh, sw, ph, pw, parallel_elements, src, dst, op); +} + +void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = src0->ne[1]; + const int64_t IW = src0->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + + pool2d_nchw_kernel_f32_f32_cuda(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_d, dst_d, op, stream); +} diff --git a/ggml-cuda/pool2d.cuh b/ggml-cuda/pool2d.cuh new file mode 100644 index 000000000..7841292bc --- /dev/null +++ b/ggml-cuda/pool2d.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_POOL2D_BLOCK_SIZE 256 + +void ggml_cuda_op_pool2d(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/quantize.cu b/ggml-cuda/quantize.cu new file mode 100644 index 000000000..7578c4b6c --- /dev/null +++ b/ggml-cuda/quantize.cu @@ -0,0 +1,45 @@ +#include "quantize.cuh" + +static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx_padded) { + const int64_t ix = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; + + if (ix >= kx_padded) { + return; + } + + const int64_t iy = (int64_t)blockDim.y*blockIdx.y + threadIdx.y; + + const int64_t i_padded = (int64_t)iy*kx_padded + ix; + + block_q8_1 * y = (block_q8_1 *) vy; + + const int64_t ib = i_padded / QK8_1; // block index + const int64_t iqs = i_padded % QK8_1; // quant index + + const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + float amax = fabsf(xi); + float sum = xi; + + amax = warp_reduce_max(amax); + sum = warp_reduce_sum(sum); + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); + + y[ib].qs[iqs] = q; + + if (iqs > 0) { + return; + } + + reinterpret_cast(y[ib].ds.x) = d; + reinterpret_cast(y[ib].ds.y) = sum; +} + +void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream) { + const int64_t block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ky, 1); + const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); + quantize_q8_1<<>>(x, vy, kx, kx_padded); +} + diff --git a/ggml-cuda/quantize.cuh b/ggml-cuda/quantize.cuh new file mode 100644 index 000000000..b37a4752f --- /dev/null +++ b/ggml-cuda/quantize.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_QUANTIZE_BLOCK_SIZE 256 + +void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream); diff --git a/ggml-cuda/rope.cu b/ggml-cuda/rope.cu new file mode 100644 index 000000000..4b0d2e5ad --- /dev/null +++ b/ggml-cuda/rope.cu @@ -0,0 +1,308 @@ +#include "rope.cuh" + +struct rope_corr_dims { + float v[4]; +}; + +static __device__ float rope_yarn_ramp(const float low, const float high, const int i0) { + const float y = (i0 / 2 - low) / max(0.001f, high - low); + return 1.0f - min(1.0f, max(0.0f, y)); +} + +// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn +// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. +static __device__ void rope_yarn( + float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale, + float * cos_theta, float * sin_theta +) { + // Get n-d rotational scaling corrected for extrapolation + float theta_interp = freq_scale * theta_extrap; + float theta = theta_interp; + if (ext_factor != 0.0f) { + float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor; + theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; + + // Get n-d magnitude scaling corrected for interpolation + mscale *= 1.0f + 0.1f * logf(1.0f / freq_scale); + } + *cos_theta = cosf(theta) * mscale; + *sin_theta = sinf(theta) * mscale; +} + +// rope == RoPE == rotary positional embedding +template +static __global__ void rope( + const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + float ext_factor, float attn_factor, rope_corr_dims corr_dims +) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = p*powf(freq_base, -float(col)/ncols); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + 1]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + 1] = x0*sin_theta + x1*cos_theta; +} + +template +static __global__ void rope_neox( + const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, + float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims +) { + const int col = 2*(blockDim.y*blockIdx.y + threadIdx.y); + + if (col >= ncols) { + return; + } + + const int row = blockDim.x*blockIdx.x + threadIdx.x; + const int ib = col / n_dims; + const int ic = col % n_dims; + + if (ib > 0) { + const int i = row*ncols + ib*n_dims + ic; + + dst[i + 0] = x[i + 0]; + dst[i + 1] = x[i + 1]; + + return; + } + + const int i = row*ncols + ib*n_dims + ic/2; + const int i2 = row/p_delta_rows; + + float cur_rot = inv_ndims * ic - ib; + + const int p = has_pos ? pos[i2] : 0; + const float theta_base = p*freq_scale*powf(theta_scale, col/2.0f); + + float cos_theta, sin_theta; + rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + n_dims/2]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; +} + +static __global__ void rope_glm_f32( + const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base, + int n_ctx +) { + const int col = blockDim.x*blockIdx.x + threadIdx.x; + const int half_n_dims = ncols/4; + + if (col >= half_n_dims) { + return; + } + + const int row = blockDim.y*blockIdx.y + threadIdx.y; + const int i = row*ncols + col; + const int i2 = row/p_delta_rows; + + const float col_theta_scale = powf(freq_base, -2.0f*col/ncols); + // FIXME: this is likely wrong + const int p = pos != nullptr ? pos[i2] : 0; + + const float theta = min(p, n_ctx - 2)*freq_scale*col_theta_scale; + const float sin_theta = sinf(theta); + const float cos_theta = cosf(theta); + + const float x0 = x[i + 0]; + const float x1 = x[i + half_n_dims]; + + dst[i + 0] = x0*cos_theta - x1*sin_theta; + dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta; + + const float block_theta = ((float)max(p - n_ctx - 2, 0))*col_theta_scale; + const float sin_block_theta = sinf(block_theta); + const float cos_block_theta = cosf(block_theta); + + const float x2 = x[i + half_n_dims * 2]; + const float x3 = x[i + half_n_dims * 3]; + + dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta; + dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; +} + + +template +static void rope_cuda( + const T * x, T * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + if (pos == nullptr) { + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); + } else { + rope<<>>( + x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims + ); + } +} + +template +static void rope_neox_cuda( + const T * x, T * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { + GGML_ASSERT(ncols % 2 == 0); + const dim3 block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1); + const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE); + const dim3 block_nums(nrows, num_blocks_x, 1); + + const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float inv_ndims = -1.0f / n_dims; + + if (pos == nullptr) { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims + ); + } else { + rope_neox<<>>( + x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, + theta_scale, inv_ndims + ); + } +} + +static void rope_glm_f32_cuda( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, int n_ctx, cudaStream_t stream +) { + GGML_ASSERT(ncols % 4 == 0); + const dim3 block_dims(CUDA_ROPE_BLOCK_SIZE/4, 1, 1); + const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE; + const dim3 block_nums(num_blocks_x, nrows, 1); + rope_glm_f32<<>>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx); +} + +static void rope_cuda_f16( + const half * x, half * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { + + rope_cuda(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); +} + +static void rope_cuda_f32( + const float * x, float * dst, int ncols, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { + + rope_cuda(x, dst, ncols, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); +} + +static void rope_neox_cuda_f16( + const half * x, half * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream) { + + rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); +} + +static void rope_neox_cuda_f32( + const float * x, float * dst, int ncols, int n_dims, int nrows, const int32_t * pos, float freq_scale, int p_delta_rows, + float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, cudaStream_t stream +) { + + rope_neox_cuda(x, dst, ncols, n_dims, nrows, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, stream); +} + +void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const float * src0_d = (const float *)src0->data; + const float * src1_d = (const float *)src1->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + GGML_ASSERT(src0->type == dst->type); + + const int64_t ne00 = src0->ne[0]; + const int64_t ne01 = src0->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t nrows = ggml_nrows(src0); + + //const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_orig_ctx = ((int32_t *) dst->op_params)[4]; + + // RoPE alteration for extended context + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); + + const int32_t * pos = nullptr; + if ((mode & 1) == 0) { + GGML_ASSERT(src1->type == GGML_TYPE_I32); + GGML_ASSERT(src1->ne[0] == ne2); + pos = (const int32_t *) src1_d; + } + + const bool is_neox = mode & 2; + const bool is_glm = mode & 4; + + rope_corr_dims corr_dims; + ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v); + + // compute + if (is_glm) { + GGML_ASSERT(false); + rope_glm_f32_cuda(src0_d, dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, stream); + } else if (is_neox) { + if (src0->type == GGML_TYPE_F32) { + rope_neox_cuda_f32( + (const float *)src0_d, (float *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_neox_cuda_f16( + (const half *)src0_d, (half *)dst_d, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, stream + ); + } else { + GGML_ASSERT(false); + } + } else { + if (src0->type == GGML_TYPE_F32) { + rope_cuda_f32( + (const float *)src0_d, (float *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, stream + ); + } else if (src0->type == GGML_TYPE_F16) { + rope_cuda_f16( + (const half *)src0_d, (half *)dst_d, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor, + attn_factor, corr_dims, stream + ); + } else { + GGML_ASSERT(false); + } + } +} diff --git a/ggml-cuda/rope.cuh b/ggml-cuda/rope.cuh new file mode 100644 index 000000000..0f787a0b2 --- /dev/null +++ b/ggml-cuda/rope.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_ROPE_BLOCK_SIZE 256 + +void ggml_cuda_op_rope(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/scale.cu b/ggml-cuda/scale.cu new file mode 100644 index 000000000..1405e066e --- /dev/null +++ b/ggml-cuda/scale.cu @@ -0,0 +1,31 @@ +#include "scale.cuh" + +static __global__ void scale_f32(const float * x, float * dst, const float scale, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + dst[i] = scale * x[i]; +} + +static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; + scale_f32<<>>(x, dst, scale, k); +} + +void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream); +} diff --git a/ggml-cuda/scale.cuh b/ggml-cuda/scale.cuh new file mode 100644 index 000000000..8ff75c829 --- /dev/null +++ b/ggml-cuda/scale.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_SCALE_BLOCK_SIZE 256 + +void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/softmax.cu b/ggml-cuda/softmax.cu new file mode 100644 index 000000000..6ed225999 --- /dev/null +++ b/ggml-cuda/softmax.cu @@ -0,0 +1,227 @@ +#include "softmax.cuh" + +template +static __device__ __forceinline__ float t2f32(T val) { + return (float) val; +} + +template <> +__device__ float __forceinline__ t2f32(half val) { + return __half2float(val); +} + +template +static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { + const int ncols = ncols_template == 0 ? ncols_par : ncols_template; + + const int tid = threadIdx.x; + const int rowx = blockIdx.x; + const int rowy = rowx % nrows_y; // broadcast the mask in the row dimension + + const int block_size = block_size_template == 0 ? blockDim.x : block_size_template; + + const int warp_id = threadIdx.x / WARP_SIZE; + const int lane_id = threadIdx.x % WARP_SIZE; + + float slope = 0.0f; + + // ALiBi + if (max_bias > 0.0f) { + const int h = rowx/nrows_y; // head index + + const float base = h < n_head_log2 ? m0 : m1; + const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; + + slope = powf(base, exp); + } + + extern __shared__ float data_soft_max_f32[]; + float * buf_iw = data_soft_max_f32; // shared memory buffer for inter-warp communication + // shared memory buffer to cache values between iterations: + float * vals = vals_smem ? buf_iw + WARP_SIZE : dst + (int64_t)rowx*ncols; + + float max_val = -INFINITY; + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + break; + } + + const int64_t ix = (int64_t)rowx*ncols + col; + const int64_t iy = (int64_t)rowy*ncols + col; + + const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f); + + vals[col] = val; + max_val = max(max_val, val); + } + + // find the max value in the block + max_val = warp_reduce_max(max_val); + if (block_size > WARP_SIZE) { + if (warp_id == 0) { + buf_iw[lane_id] = -INFINITY; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = max_val; + } + __syncthreads(); + + max_val = buf_iw[lane_id]; + max_val = warp_reduce_max(max_val); + } + + float tmp = 0.0f; // partial sum + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + break; + } + + const float val = expf(vals[col] - max_val); + tmp += val; + vals[col] = val; + } + + // find the sum of exps in the block + tmp = warp_reduce_sum(tmp); + if (block_size > WARP_SIZE) { + __syncthreads(); + if (warp_id == 0) { + buf_iw[lane_id] = 0.0f; + } + __syncthreads(); + + if (lane_id == 0) { + buf_iw[warp_id] = tmp; + } + __syncthreads(); + + tmp = buf_iw[lane_id]; + tmp = warp_reduce_sum(tmp); + } + + const float inv_sum = 1.0f / tmp; + +#pragma unroll + for (int col0 = 0; col0 < ncols; col0 += block_size) { + const int col = col0 + tid; + + if (ncols_template == 0 && col >= ncols) { + return; + } + + const int64_t idst = (int64_t)rowx*ncols + col; + dst[idst] = vals[col] * inv_sum; + } +} + +template +static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { + int nth = WARP_SIZE; + while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; + const dim3 block_dims(nth, 1, 1); + const dim3 block_nums(nrows_x, 1, 1); + const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); + static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); + + const uint32_t n_head_kv = nrows_x/nrows_y; + const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); + + const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); + const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); + + if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { + switch (ncols_x) { + case 32: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 64: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 128: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 256: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 512: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 1024: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 2048: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + case 4096: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + default: + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + break; + } + } else { + const size_t shmem_low = WARP_SIZE*sizeof(float); + soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); + } +} + +void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + const ggml_tensor * src2 = dst->src[2]; + + const float * src0_d = (const float *)src0->data; + const void * src1_d = src1 ? (const void *)src1->data : nullptr; + + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional + + const int64_t ne00 = src0->ne[0]; + const int64_t nrows_x = ggml_nrows(src0); + const int64_t nrows_y = src0->ne[1]; + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); + + // positions tensor + void * src2_d = nullptr; + + const bool use_src2 = src2 != nullptr; + + if (use_src2) { + src2_d = (void *)src2->data; + } + + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16); + + if (use_f16) { + const half * src1_dd = (const half *)src1_d; + const half * src2_dd = (const half *)src2_d; + + soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + } else { + const float * src1_dd = (const float *)src1_d; + const float * src2_dd = (const float *)src2_d; + + soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); + } +} diff --git a/ggml-cuda/softmax.cuh b/ggml-cuda/softmax.cuh new file mode 100644 index 000000000..4ef4ff86c --- /dev/null +++ b/ggml-cuda/softmax.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_SOFT_MAX_BLOCK_SIZE 1024 + +void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/sumrows.cu b/ggml-cuda/sumrows.cu new file mode 100644 index 000000000..82e8e875f --- /dev/null +++ b/ggml-cuda/sumrows.cu @@ -0,0 +1,40 @@ +#include "sumrows.cuh" + +static __global__ void k_sum_rows_f32(const float * x, float * dst, const int ncols) { + const int row = blockIdx.x; + const int col = threadIdx.x; + + float sum = 0.0f; + for (int i = col; i < ncols; i += blockDim.x) { + sum += x[row * ncols + i]; + } + + sum = warp_reduce_sum(sum); + + if (col == 0) { + dst[row] = sum; + } +} + +static void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + const dim3 block_dims(WARP_SIZE, 1, 1); + const dim3 block_nums(nrows, 1, 1); + k_sum_rows_f32<<>>(x, dst, ncols); +} + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_is_contiguous(src0)); + + + const int64_t ncols = src0->ne[0]; + const int64_t nrows = ggml_nrows(src0); + + sum_rows_f32_cuda(src0_d, dst_d, ncols, nrows, stream); +} diff --git a/ggml-cuda/sumrows.cuh b/ggml-cuda/sumrows.cuh new file mode 100644 index 000000000..e7545f83c --- /dev/null +++ b/ggml-cuda/sumrows.cuh @@ -0,0 +1,3 @@ +#include "common.cuh" + +void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/tsembd.cu b/ggml-cuda/tsembd.cu new file mode 100644 index 000000000..153ddbcda --- /dev/null +++ b/ggml-cuda/tsembd.cu @@ -0,0 +1,47 @@ +#include "tsembd.cuh" + +static __global__ void timestep_embedding_f32(const float * timesteps, float * dst, const int nb1, const int dim, const int max_period) { + // blockIDx.y: idx of timesteps->ne[0] + // blockIDx.x: idx of ((dim + 1) / 2) / BLOCK_SIZE + int i = blockIdx.y; + int j = threadIdx.x + blockIdx.x * blockDim.x; + float * embed_data = (float *)((char *)dst + i*nb1); + + if (dim % 2 != 0 && j == ((dim + 1) / 2)) { + embed_data[dim] = 0.f; + } + + int half = dim / 2; + if (j >= half) { + return; + } + + float timestep = timesteps[i]; + float freq = (float)expf(-logf(max_period) * j / half); + float arg = timestep * freq; + embed_data[j] = cosf(arg); + embed_data[j + half] = sinf(arg); +} + +static void timestep_embedding_f32_cuda(const float * x, float * dst, const int ne00, const int nb1, + const int dim, const int max_period, cudaStream_t stream) { + int half_ceil = (dim + 1) / 2; + int num_blocks = (half_ceil + CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE - 1) / CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE; + dim3 gridDim(num_blocks, ne00, 1); + timestep_embedding_f32<<>>(x, dst, nb1, dim, max_period); +} + +void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + + timestep_embedding_f32_cuda(src0_d, dst_d, src0->ne[0], dst->nb[1], dim, max_period, stream); +} diff --git a/ggml-cuda/tsembd.cuh b/ggml-cuda/tsembd.cuh new file mode 100644 index 000000000..84340e3d7 --- /dev/null +++ b/ggml-cuda/tsembd.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_TIMESTEP_EMBEDDING_BLOCK_SIZE 256 + +void ggml_cuda_op_timestep_embedding(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/unary.cu b/ggml-cuda/unary.cu new file mode 100644 index 000000000..1a7f09469 --- /dev/null +++ b/ggml-cuda/unary.cu @@ -0,0 +1,240 @@ +#include "unary.cuh" + +static __global__ void gelu_f32(const float * x, float * dst, const int k) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + + float xi = x[i]; + dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))); +} + +static __global__ void gelu_quick_f32(const float * x, float * dst, int k) { + const float GELU_QUICK_COEF = -1.702f; + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i]))); +} + +static __global__ void silu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] / (1.0f + expf(-x[i])); +} + +static __global__ void tanh_f32(const float * x, float * dst, int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = tanhf(x[i]); +} + +static __global__ void relu_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = fmaxf(x[i], 0); +} + +static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +static __global__ void hardswish_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + if (i >= k) { + return; + } + dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope; +} + +static __global__ void sqr_f32(const float * x, float * dst, const int k) { + const int i = blockDim.x*blockIdx.x + threadIdx.x; + + if (i >= k) { + return; + } + dst[i] = x[i] * x[i]; +} + +static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_f32<<>>(x, dst, k); +} + +static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE; + gelu_quick_f32<<>>(x, dst, k); +} + +static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE; + silu_f32<<>>(x, dst, k); +} + +static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE; + tanh_f32<<>>(x, dst, k); +} + +static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + relu_f32<<>>(x, dst, k); +} + +static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE; + hardsigmoid_f32<<>>(x, dst, k); +} + +static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE; + hardswish_f32<<>>(x, dst, k); +} + +static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) { + const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE; + leaky_relu_f32<<>>(x, dst, k, negative_slope); +} + +static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) { + const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE; + sqr_f32<<>>(x, dst, k); +} + +void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} + +void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + + leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream); +} + +void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream); +} diff --git a/ggml-cuda/unary.cuh b/ggml-cuda/unary.cuh new file mode 100644 index 000000000..2002ed989 --- /dev/null +++ b/ggml-cuda/unary.cuh @@ -0,0 +1,27 @@ +#include "common.cuh" + +#define CUDA_GELU_BLOCK_SIZE 256 +#define CUDA_SILU_BLOCK_SIZE 256 +#define CUDA_TANH_BLOCK_SIZE 256 +#define CUDA_RELU_BLOCK_SIZE 256 +#define CUDA_HARDSIGMOID_BLOCK_SIZE 256 +#define CUDA_HARDSWISH_BLOCK_SIZE 256 +#define CUDA_SQR_BLOCK_SIZE 256 + +void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/upscale.cu b/ggml-cuda/upscale.cu new file mode 100644 index 000000000..2f62fed48 --- /dev/null +++ b/ggml-cuda/upscale.cu @@ -0,0 +1,48 @@ +#include "upscale.cuh" + +static __global__ void upscale_f32(const float * x, float * dst, const int ne00, const int ne00xne01, const int scale_factor) { + // blockIdx.z: idx of ne02*ne03 + // blockIdx.y: idx of ne01*scale_factor, aka ne1 + // blockIDx.x: idx of ne00*scale_factor / BLOCK_SIZE + // ne00xne01: ne00 * ne01 + int ne0 = ne00 * scale_factor; + int nidx = threadIdx.x + blockIdx.x * blockDim.x; + if (nidx >= ne0) { + return; + } + // operation + int i00 = nidx / scale_factor; + int i01 = blockIdx.y / scale_factor; + int offset_src = + i00 + + i01 * ne00 + + blockIdx.z * ne00xne01; + int offset_dst = + nidx + + blockIdx.y * ne0 + + blockIdx.z * ne0 * gridDim.y; + dst[offset_dst] = x[offset_src]; +} + +static void upscale_f32_cuda(const float * x, float * dst, const int ne00, const int ne01, const int ne02, const int ne03, + const int scale_factor, cudaStream_t stream) { + int ne0 = (ne00 * scale_factor); + int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE; + dim3 gridDim(num_blocks, (ne01 * scale_factor), ne02*ne03); + upscale_f32<<>>(x, dst, ne00, ne00 * ne01, scale_factor); +} + +void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const float * src0_d = (const float *)src0->data; + float * dst_d = (float *)dst->data; + cudaStream_t stream = ctx.stream(); + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors + + const int scale_factor = dst->op_params[0]; + + upscale_f32_cuda(src0_d, dst_d, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, stream); +} diff --git a/ggml-cuda/upscale.cuh b/ggml-cuda/upscale.cuh new file mode 100644 index 000000000..d4d765230 --- /dev/null +++ b/ggml-cuda/upscale.cuh @@ -0,0 +1,5 @@ +#include "common.cuh" + +#define CUDA_UPSCALE_BLOCK_SIZE 256 + +void ggml_cuda_op_upscale(ggml_backend_cuda_context & ctx, ggml_tensor * dst); diff --git a/ggml-cuda/vecdotq.cuh b/ggml-cuda/vecdotq.cuh new file mode 100644 index 000000000..86b87fa93 --- /dev/null +++ b/ggml-cuda/vecdotq.cuh @@ -0,0 +1,1280 @@ +#include "common.cuh" + +static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) { + const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment + + int x32 = 0; + x32 |= x16[0] << 0; + x32 |= x16[1] << 16; + + return x32; +} + +static __device__ __forceinline__ int get_int_from_int8_aligned(const int8_t * x8, const int & i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + +static __device__ __forceinline__ int get_int_from_uint8_aligned(const uint8_t * x8, const int & i32) { + return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment +} + + +// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called +// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q + +#define VDR_Q4_0_Q8_1_MMVQ 2 +#define VDR_Q4_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl( + const int * v, const int * u, const float & d4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 8 from each quant value + return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_1_Q8_1_MMVQ 2 +#define VDR_Q4_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q4_1_q8_1_impl( + const int * v, const int * u, const half2 & dm4, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + const int vi0 = (v[i] >> 0) & 0x0F0F0F0F; + const int vi1 = (v[i] >> 4) & 0x0F0F0F0F; + + // SIMD dot product of quantized values + sumi = __dp4a(vi0, u[2*i+0], sumi); + sumi = __dp4a(vi1, u[2*i+1], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm4, ds8)); + const float d4d8 = tmp.x; + const float m4s8 = tmp.y; +#else + const float2 dm4f = __half22float2(dm4); + const float2 ds8f = __half22float2(ds8); + const float d4d8 = dm4f.x * ds8f.x; + const float m4s8 = dm4f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it + return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1)); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_0_Q8_1_MMVQ 2 +#define VDR_Q5_0_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_0_q8_1_impl( + const int * vl, const int * vh, const int * u, const float & d5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + + const float2 ds8f = __half22float2(ds8); + + // second part effectively subtracts 16 from each quant value + return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_1_Q8_1_MMVQ 2 +#define VDR_Q5_1_Q8_1_MMQ 4 + +template static __device__ __forceinline__ float vec_dot_q5_1_q8_1_impl( + const int * vl, const int * vh, const int * u, const half2 & dm5, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + int vi0 = (vl[i] >> 0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits + vi0 |= (vh[i] << 4) & 0x00000010; // 0 -> 4 + vi0 |= (vh[i] << 11) & 0x00001000; // 1 -> 12 + vi0 |= (vh[i] << 18) & 0x00100000; // 2 -> 20 + vi0 |= (vh[i] << 25) & 0x10000000; // 3 -> 28 + sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values + + int vi1 = (vl[i] >> 4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits + vi1 |= (vh[i] >> 12) & 0x00000010; // 16 -> 4 + vi1 |= (vh[i] >> 5) & 0x00001000; // 17 -> 12 + vi1 |= (vh[i] << 2) & 0x00100000; // 18 -> 20 + vi1 |= (vh[i] << 9) & 0x10000000; // 19 -> 28 + sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm5, ds8)); + const float d5d8 = tmp.x; + const float m5s8 = tmp.y; +#else + const float2 dm5f = __half22float2(dm5); + const float2 ds8f = __half22float2(ds8); + const float d5d8 = dm5f.x * ds8f.x; + const float m5s8 = dm5f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it + return sumi*d5d8 + m5s8 / (QI5_1 / vdr); + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q8_0_Q8_1_MMVQ 2 +#define VDR_Q8_0_Q8_1_MMQ 8 + +template static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl( + const int * v, const int * u, const float & d8_0, const float & d8_1) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + + return d8_0*d8_1 * sumi; +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +template static __device__ __forceinline__ float vec_dot_q8_1_q8_1_impl( + const int * v, const int * u, const half2 & dm8, const half2 & ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i = 0; i < vdr; ++i) { + // SIMD dot product of quantized values + sumi = __dp4a(v[i], u[i], sumi); + } + +#ifdef GGML_CUDA_F16 + const float2 tmp = __half22float2(__hmul2(dm8, ds8)); + const float d8d8 = tmp.x; + const float m8s8 = tmp.y; +#else + const float2 dm8f = __half22float2(dm8); + const float2 ds8f = __half22float2(ds8); + const float d8d8 = dm8f.x * ds8f.x; + const float m8s8 = dm8f.y * ds8f.y; +#endif // GGML_CUDA_F16 + + // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it + return sumi*d8d8 + m8s8 / (QI8_1 / vdr); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q2_K_Q8_1_MMVQ 1 +#define VDR_Q2_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmvq( + const int & v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR2_K; ++i) { + const int sc = scales[2*i]; + + const int vi = (v >> (2*i)) & 0x03030303; + + sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values + } + + const float2 dm2f = __half22float2(dm2); + + return dm2f.x*sumf_d - dm2f.y*sumf_m; +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q2_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const half2 & dm2, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi_d = 0; + int sumi_m = 0; + +#pragma unroll + for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) { + int sumi_d_sc = 0; + + const int sc = scales[i0 / (QI8_1/2)]; + + // fill int with 4x m + int m = sc >> 4; + m |= m << 8; + m |= m << 16; + +#pragma unroll + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product + sumi_m = __dp4a(m, u[i], sumi_m); // multiply sum of q8_1 values with m + } + + sumi_d += sumi_d_sc * (sc & 0xF); + } + + const float2 dm2f = __half22float2(dm2); + + return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m); +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q3_K_Q8_1_MMVQ 1 +#define VDR_Q3_K_Q8_1_MMQ 2 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const uint8_t * __restrict__ scales, + const int & scale_offset, const float & d3, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + const int isc = scale_offset + 2*i; + + const int isc_low = isc % (QK_K/32); + const int sc_shift_low = 4 * (isc / (QK_K/32)); + const int sc_low = (scales[isc_low] >> sc_shift_low) & 0xF; + + const int isc_high = isc % (QK_K/64); + const int sc_shift_high = 2 * (isc / (QK_K/64)); + const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4; + + const int sc = (sc_low | sc_high) - 32; + + const int vil = (vl >> (2*i)) & 0x03030303; + + const int vih = ((vh >> i) << 2) & 0x04040404; + + const int vi = __vsubss4(vil, vih); + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d3 * sumf; +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q3_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d3, const float & d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + int sumi = 0; + +#pragma unroll + for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) { + int sumi_sc = 0; + + for (int i = i0; i < i0 + QI8_1/2; ++i) { + sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product + } + + sumi += sumi_sc * scales[i0 / (QI8_1/2)]; + } + + return d3*d8 * sumi; +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q4_K_Q8_1_MMVQ 2 +#define VDR_Q4_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_vmmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K; ++i) { + const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F; + const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F; + + const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q4_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q5_K_Q8_1_MMVQ 2 +#define VDR_Q5_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_vmmq( + const int * __restrict__ vl, const int * __restrict__ vh, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm5, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F; + const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F; + + const int vh0i = ((vh[0] >> i) << 4) & 0x10101010; + const int vh1i = ((vh[1] >> i) << 4) & 0x10101010; + + const int v0i = vl0i | vh0i; + const int v1i = vl1i | vh1i; + + const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product + const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u + + sumf_d += d8[i] * (dot1 * sc[i]); + sumf_m += d8[i] * (dot2 * m[i]); + + } + + const float2 dm5f = __half22float2(dm5); + + return dm5f.x*sumf_d - dm5f.y*sumf_m; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q5_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const uint8_t * __restrict__ sc, + const uint8_t * __restrict__ m, const half2 & dm4, const half2 * __restrict__ ds8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + float sumf_m = 0.0f; + +#pragma unroll + for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) { + int sumi_d = 0; + +#pragma unroll + for (int j = 0; j < QI8_1; ++j) { + sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product + } + + const float2 ds8f = __half22float2(ds8[i]); + + sumf_d += ds8f.x * (sc[i] * sumi_d); + sumf_m += ds8f.y * m[i]; // sum of q8_1 block * q4_K min val + } + + const float2 dm4f = __half22float2(dm4); + + return dm4f.x*sumf_d - dm4f.y*sumf_m; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +#define VDR_Q6_K_Q8_1_MMVQ 1 +#define VDR_Q6_K_Q8_1_MMQ 8 + +// contiguous v/x values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmvq( + const int & vl, const int & vh, const int * __restrict__ u, const int8_t * __restrict__ scales, + const float & d, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf = 0.0f; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + const int sc = scales[4*i]; + + const int vil = (vl >> (4*i)) & 0x0F0F0F0F; + + const int vih = ((vh >> (4*i)) << 4) & 0x30303030; + + const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32 + + sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product + } + + return d*sumf; +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +// contiguous u/y values +static __device__ __forceinline__ float vec_dot_q6_K_q8_1_impl_mmq( + const int * __restrict__ v, const int * __restrict__ u, const int8_t * __restrict__ sc, + const float & d6, const float * __restrict__ d8) { + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + float sumf_d = 0.0f; + +#pragma unroll + for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) { + int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale + +#pragma unroll + for (int i = i0; i < i0 + 2; ++i) { + sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product + sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product + + sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product + sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product + } + + sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y); + } + + return d6 * sumf_d; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A +} + +static __device__ __forceinline__ float vec_dot_q4_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq; + + int v[VDR_Q4_0_Q8_1_MMVQ]; + int u[2*VDR_Q4_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8(bq4_0->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0); + } + + return vec_dot_q4_0_q8_1_impl(v, u, bq4_0->d, bq8_1->ds); +} + + +static __device__ __forceinline__ float vec_dot_q4_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq; + + int v[VDR_Q4_1_Q8_1_MMVQ]; + int u[2*VDR_Q4_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_uint8_aligned(bq4_1->qs, iqs + i); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1); + } + + return vec_dot_q4_1_q8_1_impl(v, u, bq4_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q5_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq; + + int vl[VDR_Q5_0_Q8_1_MMVQ]; + int vh[VDR_Q5_0_Q8_1_MMVQ]; + int u[2*VDR_Q5_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8(bq5_0->qs, iqs + i); + vh[i] = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0); + } + + return vec_dot_q5_0_q8_1_impl(vl, vh, u, bq5_0->d, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q5_1_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq; + + int vl[VDR_Q5_1_Q8_1_MMVQ]; + int vh[VDR_Q5_1_Q8_1_MMVQ]; + int u[2*VDR_Q5_1_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) { + vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i); + vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i)); + u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1); + } + + return vec_dot_q5_1_q8_1_impl(vl, vh, u, bq5_1->dm, bq8_1->ds); +} + +static __device__ __forceinline__ float vec_dot_q8_0_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq; + + int v[VDR_Q8_0_Q8_1_MMVQ]; + int u[VDR_Q8_0_Q8_1_MMVQ]; + +#pragma unroll + for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) { + v[i] = get_int_from_int8(bq8_0->qs, iqs + i); + u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i); + } + + return vec_dot_q8_0_q8_1_impl(v, u, bq8_0->d, __low2half(bq8_1->ds)); +} + +static __device__ __forceinline__ float vec_dot_q2_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q2_K * bq2_K = (const block_q2_K *) vbq; + + const int bq8_offset = QR2_K * (iqs / QI8_1); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const uint8_t * scales = bq2_K->scales + scale_offset; + + const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs); + int u[QR2_K]; + float d8[QR2_K]; + +#pragma unroll + for (int i = 0; i < QR2_K; ++ i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8); +} + +static __device__ __forceinline__ float vec_dot_q3_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q3_K * bq3_K = (const block_q3_K *) vbq; + + const int bq8_offset = QR3_K * (iqs / (QI3_K/2)); + const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2); + + const float d = bq3_K->d; + + const int vl = get_int_from_uint8(bq3_K->qs, iqs); + + // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted + const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset; + + int u[QR3_K]; + float d8[QR3_K]; + +#pragma unroll + for (int i = 0; i < QR3_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + i].ds); + } + + return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8); +} + +static __device__ __forceinline__ float vec_dot_q4_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + int v[2]; + int u[2*QR4_K]; + float d8[QR4_K]; + + // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6 + const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2)); + + // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12 + // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44 + // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76 + // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108 + + const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + v[0] = q4[0]; + v[1] = q4[4]; + + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + + for (int i = 0; i < QR4_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q4_K * bq4_K = (const block_q4_K *) vbq; + + float sumf_d = 0.0f; + float sumf_m = 0.0f; + + uint16_t aux16[2]; + const uint8_t * s = (const uint8_t *)aux16; + + const uint16_t * a = (const uint16_t *)bq4_K->scales; + aux16[0] = a[0] & 0x0f0f; + aux16[1] = (a[0] >> 4) & 0x0f0f; + + const float dall = bq4_K->dm[0]; + const float dmin = bq4_K->dm[1]; + + const float d8_1 = __low2float(bq8_1[0].ds); + const float d8_2 = __low2float(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * q4 = (const int *)bq4_K->qs + (iqs/2); + const int v1 = q4[0]; + const int v2 = q4[4]; + + const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0)); + const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0)); + const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0)); + const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0)); + + sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]); + sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]); + + return dall * sumf_d - dmin * sumf_m; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +static __device__ __forceinline__ float vec_dot_q5_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#ifndef GGML_QKK_64 + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + int vl[2]; + int vh[2]; + int u[2*QR5_K]; + float d8[QR5_K]; + + const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2)); + const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4)); + const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4)); + + vl[0] = ql[0]; + vl[1] = ql[4]; + + vh[0] = qh[0] >> bq8_offset; + vh[1] = qh[4] >> bq8_offset; + + const uint16_t * scales = (const uint16_t *)bq5_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; + +#pragma unroll + for (int i = 0; i < QR5_K; ++i) { + const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; + d8[i] = __low2float(bq8i->ds); + + const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4); + u[2*i+0] = q8[0]; + u[2*i+1] = q8[4]; + } + + return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8); + +#else + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q5_K * bq5_K = (const block_q5_K *) vbq; + + const int8_t * s = bq5_K->scales; + + const float d = bq5_K->d; + + const float d8_1 = __low2half(bq8_1[0].ds); + const float d8_2 = __low2half(bq8_1[1].ds); + + const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2)); + const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4); + const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2)); + const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4); + + const int * ql = (const int *)bq5_K->qs + (iqs/2); + const int vl1 = ql[0]; + const int vl2 = ql[4]; + + const int step = 4 * (iqs/2); // 0, 4, 8, 12 + const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6 + const int in = step%8; // 0, 4, 0, 4 + const int vh = (*((const int *)(bq5_K->qh + in))) >> im; + + const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f); + const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f); + const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f); + const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f); + + const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1]) + + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]); + + return d * sumf_d; + +#else + NO_DEVICE_CODE; +#endif // __CUDA_ARCH__ >= MIN_CC_DP4A + +#endif +} + +static __device__ __forceinline__ float vec_dot_q6_K_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_q6_K * bq6_K = (const block_q6_K *) vbq; + + const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4); + const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8); + const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4)); + + const int vl = get_int_from_uint8(bq6_K->ql, iqs); + const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift; + + const int8_t * scales = bq6_K->scales + scale_offset; + + int u[QR6_K]; + float d8[QR6_K]; + +#pragma unroll + for (int i = 0; i < QR6_K; ++i) { + u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1); + d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds); + } + + return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8); +} + +static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq2_xxs * bq2 = (const block_iq2_xxs *) vbq; + +#if QR2_XXS == 8 + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = q2[2] | (q2[3] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); + const uint8_t signs = ksigns_iq2xs[aux32 & 127]; + for (int j = 0; j < 8; ++j) { + sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.25f; + return d * sumi; +#else + // iqs is 0...15 + const int ib32 = iqs/2; + const int il = iqs%2; + const uint16_t * q2 = bq2->qs + 4*ib32; + const uint8_t * aux8 = (const uint8_t *)q2; + const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); + const uint32_t aux32 = q2[2] | (q2[3] << 16); + const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * __low2float(bq8_1[ib32].ds) * 0.25f; + const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; + const uint8_t signs2 = ksigns_iq2xs[(aux32 >> (14*il + 7)) & 127]; + const int8_t * q8 = bq8_1[ib32].qs + 16*il; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j+0] * grid1[j] * (signs1 & kmask_iq2xs[j] ? -1 : 1); + sumi2 += q8[j+8] * grid2[j] * (signs2 & kmask_iq2xs[j] ? -1 : 1); + } + return d * (sumi1 + sumi2); +#endif +#else + NO_DEVICE_CODE; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); + sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2xs_grid + (q2[l] & 511)); + const uint32_t * signs = (const uint32_t *)(ksigns64 + (q2[l] >> 9)); + const int grid_l = __vsub4(grid[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid[1] ^ signs[1], signs[1]); + sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + GGML_UNUSED(ksigns64); + NO_DEVICE_CODE; +#endif +#else + GGML_UNUSED(ksigns64); + NO_DEVICE_CODE; +#endif +} + +// TODO +static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + const int grid_l = __vsub4(grid[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid[1] ^ signs1, signs1); + sumi1 = __dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = __dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = __vcmpeq4(((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + const uint32_t signs1 = __vcmpeq4(((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + const int grid_l = __vsub4(grid[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid[1] ^ signs1, signs1); + sumi2 = __dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = __dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + GGML_UNUSED(ksigns64); + NO_DEVICE_CODE; +#endif +#else + GGML_UNUSED(ksigns64); + NO_DEVICE_CODE; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq3_xxs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_xxs * bq2 = (const block_iq3_xxs *) vbq; + + const int ib32 = iqs; + const uint8_t * q3 = bq2->qs + 8*ib32; + const uint16_t * gas = (const uint16_t *)(bq2->qs + QK_K/4) + 2*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + uint32_t aux32 = gas[0] | (gas[1] << 16); + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3xxs_grid + q3[2*l+0]; + const uint32_t * grid2 = iq3xxs_grid + q3[2*l+1]; + const uint32_t * signs = (const uint32_t *)(ksigns64 + (aux32 & 127)); + const int grid_l = __vsub4(grid1[0] ^ signs[0], signs[0]); + const int grid_h = __vsub4(grid2[0] ^ signs[1], signs[1]); + sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + q8 += 8; + aux32 >>= 7; + } + const float d = (float)bq2->d * (0.5f + aux32) * __low2float(bq8_1[ib32].ds) * 0.5f; + return d * sumi; +#else + NO_DEVICE_CODE; +#endif +#else + NO_DEVICE_CODE; +#endif +} + +// TODO: don't use lookup table for signs +static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +#if QK_K == 256 + const block_iq3_s * bq2 = (const block_iq3_s *) vbq; + + const int ib32 = iqs; + const uint8_t * qs = bq2->qs + 8*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + int sumi = 0; + for (int l = 0; l < 4; ++l) { + const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); + uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); + const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); + const int grid_h = __vsub4(grid2[0] ^ signs1, signs1); + sumi = __dp4a(grid_l, *((int *)q8+0), sumi); + sumi = __dp4a(grid_h, *((int *)q8+1), sumi); + q8 += 8; + } + const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); + return d * sumi; +#else + NO_DEVICE_CODE; +#endif +#else + NO_DEVICE_CODE; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq1_s_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq1_s * bq1 = (const block_iq1_s *) vbq; + + const int ib32 = iqs; + int sumi = 0; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi)); + } +#else + const int8_t * q8 = bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + for (int j = 0; j < 4; ++j) { + sumi += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); + } + q8 += 8; + } +#endif + const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; + const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); + const float d = d1q * __low2float (bq8_1[ib32].ds); + const float m = d1q * __high2float(bq8_1[ib32].ds); + return d * sumi + m * delta; +#else + NO_DEVICE_CODE; +#endif +} + +static __device__ __forceinline__ float vec_dot_iq1_m_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l/2] = __dp4a(q8[2*l+1], grid1, __dp4a(q8[2*l+0], grid0, sumi[l/2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = __dp4a(q8[2*l+1], 0x01010101, __dp4a(q8[2*l+0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } +#else + const int8_t * q8 = bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int sumy = 0; + for (int j = 0; j < 4; ++j) { + sumi[l/2] += q8[j] * (grid[j] & 0xf) + q8[j+4] * (grid[j] >> 4); + sumy += q8[j] + q8[j+4]; + } + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + sumf[l/2] += delta*sumy; + q8 += 8; + } +#endif + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * __low2float (bq8_1[ib32].ds); + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + NO_DEVICE_CODE; +#endif +} + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics +static __device__ __forceinline__ void get_int_from_table_16(const uint32_t & q4, const uint8_t * values, + int & val1, int & val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} +#endif + +static __device__ __forceinline__ float vec_dot_iq4_nl_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = __dp4a(v1, q8[l+0], sumi1); + sumi2 = __dp4a(v2, q8[l+4], sumi2); + } + +#else + const uint8_t * q4 = bq->qs + 4*iqs; + const int8_t * q8 = bq8_1->qs + 4*iqs; + + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < 4*VDR_Q4_0_Q8_1_MMVQ; ++l) { + sumi1 += q8[l+ 0] * kvalues_iq4nl[q4[l] & 0xf]; + sumi2 += q8[l+16] * kvalues_iq4nl[q4[l] >> 4]; + } +#endif + const float d = (float)bq->d * __low2float(bq8_1->ds); + return d * (sumi1 + sumi2); +} + +static __device__ __forceinline__ float vec_dot_iq4_xs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { + +#if QK_K == 256 +#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics + + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + // iqs is 0...7 + const int ib32 = iqs; + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * __low2float(bq8_1[ib32].ds); + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 4; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = __dp4a(v1, q8[j+0], sumi1); + sumi2 = __dp4a(v2, q8[j+4], sumi2); + } + return d * (sumi1 + sumi2); + +#else + NO_DEVICE_CODE; +#endif +#else + return vec_dot_iq4_xs_q8_1(vbq, bq8_1, iqs); +#endif +} diff --git a/ggml-impl.h b/ggml-impl.h index e68b72877..d85b152bf 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -11,6 +11,89 @@ #include // memcpy #include // fabsf +#undef MIN +#undef MAX + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +/** + * Converts brain16 to float32. + * + * The bfloat16 floating point format has the following structure: + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───┐ + * 0b0000000000000000 brain16 + * + * Since bf16 has the same number of exponent bits as a 32bit float, + * encoding and decoding numbers becomes relatively straightforward. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌──┴───┐┌─┴───────────────────┐ + * 0b00000000000000000000000000000000 IEEE binary32 + * + * For comparison, the standard fp16 format has fewer exponent bits. + * + * ┌sign + * │ + * │ ┌exponent + * │ │ + * │ │ ┌mantissa + * │ │ │ + * │┌─┴─┐┌─┴──────┐ + * 0b0000000000000000 IEEE binary16 + * + * @see IEEE 754-2008 + */ +static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) { + union { + float f; + uint32_t i; + } u; + u.i = (uint32_t)h.bits << 16; + return u.f; +} + +/** + * Converts float32 to brain16. + * + * This function is binary identical to AMD Zen4 VCVTNEPS2BF16. + * Subnormals shall be flushed to zero, and NANs will be quiet. + * This code should vectorize nicely if using modern compilers. + */ +static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) { + ggml_bf16_t h; + union { + float f; + uint32_t i; + } u; + u.f = s; + if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */ + h.bits = (u.i >> 16) | 64; /* force to quiet */ + return h; + } + if (!(u.i & 0x7f800000)) { /* subnormal */ + h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */ + return h; + } + h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16; + return h; +} + +#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x) +#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x) + #ifdef __cplusplus extern "C" { #endif @@ -45,7 +128,7 @@ extern "C" { // 16-bit float // on Arm, we use __fp16 // on x86, we use uint16_t -#if defined(__ARM_NEON) && !defined(_MSC_VER) +#if defined(__ARM_NEON) // if YCM cannot find , make a symbolic link to it, for example: // @@ -53,8 +136,262 @@ extern "C" { // #include +#ifdef _MSC_VER + +typedef uint16_t ggml_fp16_internal_t; + +#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } + +#else + typedef __fp16 ggml_fp16_internal_t; +#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } + +#endif // _MSC_VER + +#if !defined(__aarch64__) + +// 32-bit ARM compatibility + +// vaddvq_s16 +// vpaddq_s16 +// vpaddq_s32 +// vaddvq_s32 +// vaddvq_f32 +// vmaxvq_f32 +// vcvtnq_s32_f32 +// vzip1_u8 +// vzip2_u8 + +inline static int32_t vaddvq_s16(int16x8_t v) { + return + (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + + (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + + (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + + (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); +} + +inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { + int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); + int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); + return vcombine_s16(a0, b0); +} + +inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { + int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); + int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); + return vcombine_s32(a0, b0); +} + +inline static int32_t vaddvq_s32(int32x4_t v) { + return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); +} + +inline static float vaddvq_f32(float32x4_t v) { + return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); +} + +inline static float vmaxvq_f32(float32x4_t v) { + return + MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), + MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); +} + +inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { + int32x4_t res; + + res[0] = roundf(vgetq_lane_f32(v, 0)); + res[1] = roundf(vgetq_lane_f32(v, 1)); + res[2] = roundf(vgetq_lane_f32(v, 2)); + res[3] = roundf(vgetq_lane_f32(v, 3)); + + return res; +} + +inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[0]; res[1] = b[0]; + res[2] = a[1]; res[3] = b[1]; + res[4] = a[2]; res[5] = b[2]; + res[6] = a[3]; res[7] = b[3]; + + return res; +} + +inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { + uint8x8_t res; + + res[0] = a[4]; res[1] = b[4]; + res[2] = a[5]; res[3] = b[5]; + res[4] = a[6]; res[5] = b[6]; + res[6] = a[7]; res[7] = b[7]; + + return res; +} + +// vld1q_s16_x2 +// vld1q_u8_x2 +// vld1q_u8_x4 +// vld1q_s8_x2 +// vld1q_s8_x4 +// TODO: double-check these work correctly + +typedef struct ggml_int16x8x2_t { + int16x8_t val[2]; +} ggml_int16x8x2_t; + +inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { + ggml_int16x8x2_t res; + + res.val[0] = vld1q_s16(ptr + 0); + res.val[1] = vld1q_s16(ptr + 8); + + return res; +} + +typedef struct ggml_uint8x16x2_t { + uint8x16_t val[2]; +} ggml_uint8x16x2_t; + +inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { + ggml_uint8x16x2_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + + return res; +} + +typedef struct ggml_uint8x16x4_t { + uint8x16_t val[4]; +} ggml_uint8x16x4_t; + +inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { + ggml_uint8x16x4_t res; + + res.val[0] = vld1q_u8(ptr + 0); + res.val[1] = vld1q_u8(ptr + 16); + res.val[2] = vld1q_u8(ptr + 32); + res.val[3] = vld1q_u8(ptr + 48); + + return res; +} + +typedef struct ggml_int8x16x2_t { + int8x16_t val[2]; +} ggml_int8x16x2_t; + +inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { + ggml_int8x16x2_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + + return res; +} + +typedef struct ggml_int8x16x4_t { + int8x16_t val[4]; +} ggml_int8x16x4_t; + +inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { + ggml_int8x16x4_t res; + + res.val[0] = vld1q_s8(ptr + 0); + res.val[1] = vld1q_s8(ptr + 16); + res.val[2] = vld1q_s8(ptr + 32); + res.val[3] = vld1q_s8(ptr + 48); + + return res; +} + +// NOTE: not tested +inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { + int8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +// NOTE: not tested +inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { + uint8x16_t res; + + res[ 0] = a[b[ 0]]; + res[ 1] = a[b[ 1]]; + res[ 2] = a[b[ 2]]; + res[ 3] = a[b[ 3]]; + res[ 4] = a[b[ 4]]; + res[ 5] = a[b[ 5]]; + res[ 6] = a[b[ 6]]; + res[ 7] = a[b[ 7]]; + res[ 8] = a[b[ 8]]; + res[ 9] = a[b[ 9]]; + res[10] = a[b[10]]; + res[11] = a[b[11]]; + res[12] = a[b[12]]; + res[13] = a[b[13]]; + res[14] = a[b[14]]; + res[15] = a[b[15]]; + + return res; +} + +#else + +#define ggml_int16x8x2_t int16x8x2_t +#define ggml_uint8x16x2_t uint8x16x2_t +#define ggml_uint8x16x4_t uint8x16x4_t +#define ggml_int8x16x2_t int8x16x2_t +#define ggml_int8x16x4_t int8x16x4_t + +#define ggml_vld1q_s16_x2 vld1q_s16_x2 +#define ggml_vld1q_u8_x2 vld1q_u8_x2 +#define ggml_vld1q_u8_x4 vld1q_u8_x4 +#define ggml_vld1q_s8_x2 vld1q_s8_x2 +#define ggml_vld1q_s8_x4 vld1q_s8_x4 +#define ggml_vqtbl1q_s8 vqtbl1q_s8 +#define ggml_vqtbl1q_u8 vqtbl1q_u8 + +#endif // !defined(__aarch64__) + +#if !defined(__ARM_FEATURE_DOTPROD) + +inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { + const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); + const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); + + return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); +} + +#else + +#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) + +#endif // !defined(__ARM_FEATURE_DOTPROD) + +#endif // defined(__ARM_NEON) + +#if defined(__ARM_NEON) && !defined(_MSC_VER) + #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) @@ -75,8 +412,6 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #else -typedef uint16_t ggml_fp16_internal_t; - #ifdef __wasm_simd128__ #include #else @@ -88,7 +423,7 @@ typedef uint16_t ggml_fp16_internal_t; #if defined(_MSC_VER) || defined(__MINGW32__) #include #else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) || defined(__SSE__) #if !defined(__riscv) #include #endif @@ -221,7 +556,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #endif // __F16C__ -#endif // __ARM_NEON +#endif // defined(__ARM_NEON) && (!defined(__MSC_VER) // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 81dd50678..9a469821d 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1427,9 +1427,14 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml for (int i = node_start; i < node_end; ++i) { struct ggml_tensor * src0 = gf->nodes[i]->src[0]; struct ggml_tensor * src1 = gf->nodes[i]->src[1]; + struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2); struct ggml_tensor * dst = gf->nodes[i]; GGML_ASSERT(dst->data != nullptr); + if (ggml_is_empty(dst)) { + continue; + } + switch (dst->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -1555,6 +1560,12 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml { float scale; memcpy(&scale, dst->op_params, sizeof(float)); + +#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support") +#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") + GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); + GGML_ASSERT(src2 == nullptr); + ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); } break; case GGML_OP_DIAG_MASK_INF: diff --git a/ggml-metal.m b/ggml-metal.m index 54d1f1d5a..c6817f01f 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -37,13 +37,19 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_DIV_ROW, GGML_METAL_KERNEL_TYPE_SCALE, GGML_METAL_KERNEL_TYPE_SCALE_4, + GGML_METAL_KERNEL_TYPE_CLAMP, GGML_METAL_KERNEL_TYPE_TANH, GGML_METAL_KERNEL_TYPE_RELU, GGML_METAL_KERNEL_TYPE_GELU, + GGML_METAL_KERNEL_TYPE_GELU_4, GGML_METAL_KERNEL_TYPE_GELU_QUICK, + GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, GGML_METAL_KERNEL_TYPE_SILU, - GGML_METAL_KERNEL_TYPE_SOFT_MAX, - GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, + GGML_METAL_KERNEL_TYPE_SILU_4, + GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, + GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, + GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, + GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, @@ -64,6 +70,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, + GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, @@ -91,6 +98,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, @@ -114,6 +122,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, @@ -134,6 +143,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, @@ -154,6 +164,7 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, + GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32, @@ -168,6 +179,14 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, + GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, GGML_METAL_KERNEL_TYPE_CPY_F32_F16, GGML_METAL_KERNEL_TYPE_CPY_F32_F32, GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, @@ -246,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){ static void * ggml_metal_host_malloc(size_t n) { void * data = NULL; + +#if TARGET_OS_OSX + kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE); + if (err != KERN_SUCCESS) { + GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__); + return NULL; + } +#else const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); if (result != 0) { GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__); return NULL; } +#endif return data; } @@ -434,7 +462,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { } /* - GGML_METAL_LOG_INFO("%s: loaded %-32s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ + GGML_METAL_LOG_INFO("%s: loaded %-40s %16p | th_max = %4d | th_width = %4d\n", __func__, "kernel_"#name, (void *) kernel->pipeline, \ (int) kernel->pipeline.maxTotalThreadsPerThreadgroup, \ (int) kernel->pipeline.threadExecutionWidth); \ */ @@ -450,163 +478,182 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) { return NULL; \ } \ } else { \ - GGML_METAL_LOG_WARN("%s: skipping %-32s (not supported)\n", __func__, "kernel_"#name); \ + GGML_METAL_LOG_WARN("%s: skipping %-40s (not supported)\n", __func__, "kernel_"#name); \ } // simd_sum and simd_max requires MTLGPUFamilyApple7 - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX, soft_max, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_4, soft_max_4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction); - //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true); - GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD, add, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL, mul, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV, div, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV_ROW, div_row, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE, scale, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SCALE_4, scale_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK_4, gelu_quick_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU, silu, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SILU_4, silu_4, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16, soft_max_f16, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4, soft_max_f16_4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32, soft_max_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4, soft_max_f32_4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF, diag_mask_inf, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIAG_MASK_INF_8, diag_mask_inf_8, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F32, get_rows_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_F16, get_rows_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_0, get_rows_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_1, get_rows_q4_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_0, get_rows_q5_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_1, get_rows_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q8_0, get_rows_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q2_K, get_rows_q2_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q3_K, get_rows_q3_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q4_K, get_rows_q4_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q5_K, get_rows_q5_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_Q6_K, get_rows_q6_K, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XXS, get_rows_iq2_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_XS, get_rows_iq2_xs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_XXS, get_rows_iq3_xxs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S, get_rows_iq3_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S, get_rows_iq2_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S, get_rows_iq1_s, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M, get_rows_iq1_m, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL, get_rows_iq4_nl, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS, get_rows_iq4_xs, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GET_ROWS_I32, get_rows_i32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RMS_NORM, rms_norm, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GROUP_NORM, group_norm, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NORM, norm, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_0_F32, mul_mv_q4_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_1_F32, mul_mv_q4_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_0_F32, mul_mv_q5_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_1_F32, mul_mv_q5_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q8_0_F32, mul_mv_q8_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q2_K_F32, mul_mv_q2_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q3_K_F32, mul_mv_q3_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q4_K_F32, mul_mv_q4_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q5_K_F32, mul_mv_q5_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_Q6_K_F32, mul_mv_q6_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XXS_F32, mul_mv_iq2_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_XS_F32, mul_mv_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_XXS_F32, mul_mv_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ3_S_F32, mul_mv_iq3_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ2_S_F32, mul_mv_iq2_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32, mul_mv_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32, mul_mv_iq1_m_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_NL_F32, mul_mv_iq4_nl_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_IQ4_XS_F32, mul_mv_iq4_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F32_F32, mul_mv_id_f32_f32, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F16, mul_mv_id_f16_f16, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32, mul_mv_id_f16_f32, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_1ROW, mul_mv_id_f16_f32_1row, ctx->support_simdgroup_reduction); + //GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_F16_F32_L4, mul_mv_id_f16_f32_l4, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_0_F32, mul_mv_id_q4_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_1_F32, mul_mv_id_q4_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_0_F32, mul_mv_id_q5_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_1_F32, mul_mv_id_q5_1_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q8_0_F32, mul_mv_id_q8_0_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q2_K_F32, mul_mv_id_q2_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q3_K_F32, mul_mv_id_q3_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q4_K_F32, mul_mv_id_q4_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q5_K_F32, mul_mv_id_q5_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_Q6_K_F32, mul_mv_id_q6_K_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XXS_F32, mul_mv_id_iq2_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_XS_F32, mul_mv_id_iq2_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_XXS_F32, mul_mv_id_iq3_xxs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ3_S_F32, mul_mv_id_iq3_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ2_S_F32, mul_mv_id_iq2_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32, mul_mv_id_iq1_s_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32, mul_mv_id_iq1_m_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32, mul_mv_id_iq4_nl_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32, mul_mv_id_iq4_xs_f32, ctx->support_simdgroup_reduction); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F32_F32, mul_mm_f32_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_F16_F32, mul_mm_f16_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_0_F32, mul_mm_q4_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_1_F32, mul_mm_q4_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_0_F32, mul_mm_q5_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_1_F32, mul_mm_q5_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q8_0_F32, mul_mm_q8_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q2_K_F32, mul_mm_q2_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q3_K_F32, mul_mm_q3_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q4_K_F32, mul_mm_q4_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q5_K_F32, mul_mm_q5_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_Q6_K_F32, mul_mm_q6_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XXS_F32, mul_mm_iq2_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_XS_F32, mul_mm_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_XXS_F32, mul_mm_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32, mul_mm_iq3_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32, mul_mm_iq2_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32, mul_mm_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32, mul_mm_iq1_m_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32, mul_mm_iq4_nl_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32, mul_mm_iq4_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32, mul_mm_id_f32_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32, mul_mm_id_f16_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32, mul_mm_id_q4_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_1_F32, mul_mm_id_q4_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_0_F32, mul_mm_id_q5_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_1_F32, mul_mm_id_q5_1_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q8_0_F32, mul_mm_id_q8_0_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q2_K_F32, mul_mm_id_q2_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q3_K_F32, mul_mm_id_q3_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_K_F32, mul_mm_id_q4_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q5_K_F32, mul_mm_id_q5_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q6_K_F32, mul_mm_id_q6_K_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XXS_F32, mul_mm_id_iq2_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_XS_F32, mul_mm_id_iq2_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_XXS_F32, mul_mm_id_iq3_xxs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32, mul_mm_id_iq3_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32, mul_mm_id_iq2_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32, mul_mm_id_iq1_s_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32, mul_mm_id_iq1_m_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32, mul_mm_id_iq4_nl_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_PAD_F32, pad_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TIMESTEP_EMBEDDING_F32, timestep_embedding_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARANGE_F32, arange_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_ASC, argsort_f32_i32_asc, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGSORT_F32_I32_DESC, argsort_f32_i32_desc, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_LEAKY_RELU_F32, leaky_relu_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64, flash_attn_ext_f16_h64, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80, flash_attn_ext_f16_h80, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96, flash_attn_ext_f16_h96, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112, flash_attn_ext_f16_h112, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128, flash_attn_ext_f16_h128, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256, flash_attn_ext_f16_h256, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128, flash_attn_ext_vec_f16_h128, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256, flash_attn_ext_vec_f16_h256, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F16, cpy_f32_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_F32, cpy_f32_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q8_0, cpy_f32_q8_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_0, cpy_f32_q4_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q4_1, cpy_f32_q4_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_0, cpy_f32_q5_0, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_Q5_1, cpy_f32_q5_1, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F16, cpy_f16_f16, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F16_F32, cpy_f16_f32, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT, concat, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR, sqr, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); } [metal_library release]; @@ -703,6 +750,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_MUL: case GGML_OP_DIV: case GGML_OP_SCALE: + case GGML_OP_CLAMP: case GGML_OP_SQR: case GGML_OP_SUM_ROWS: return true; @@ -724,6 +772,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_ARGSORT: case GGML_OP_LEAKY_RELU: + case GGML_OP_FLASH_ATTN_EXT: return true; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: @@ -763,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const case GGML_OP_DIAG_MASK_INF: case GGML_OP_GET_ROWS: { - return op->ne[3] == 1; + return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1; } default: return false; @@ -837,6 +886,10 @@ static enum ggml_status ggml_metal_graph_compute( struct ggml_tensor * src2 = gf->nodes[i]->src[2]; struct ggml_tensor * dst = gf->nodes[i]; + if (ggml_is_empty(dst)) { + continue; + } + switch (dst->op) { case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -1140,8 +1193,30 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_CLAMP: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline; + + float min; + float max; + memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float)); + memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float)); + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&min length:sizeof(min) atIndex:2]; + [encoder setBytes:&max length:sizeof(max) atIndex:3]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { + // we are not taking into account the strides, so for now require contiguous tensors + GGML_ASSERT(ggml_is_contiguous(src0)); + case GGML_UNARY_OP_TANH: { id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_TANH].pipeline; @@ -1168,42 +1243,60 @@ static enum ggml_status ggml_metal_graph_compute( } break; case GGML_UNARY_OP_GELU: { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU].pipeline; + } [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_UNARY_OP_GELU_QUICK: { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GELU_QUICK].pipeline; + } [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_UNARY_OP_SILU: { - id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; + int64_t n = ggml_nelements(dst); + + id pipeline = nil; + + if (n % 4 == 0) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU_4].pipeline; + n /= 4; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SILU].pipeline; + } [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; default: { @@ -1263,20 +1356,33 @@ static enum ggml_status ggml_metal_graph_compute( } break; case GGML_OP_SOFT_MAX: { + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); + GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); + int nth = 32; // SIMD width id pipeline = nil; + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16); + if (ne00%4 == 0) { while (nth < ne00/4 && nth < 256) { nth *= 2; } - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_4].pipeline; + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16_4].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32_4].pipeline; + } } else { while (nth < ne00 && nth < 1024) { nth *= 2; } - pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX].pipeline; + if (use_f16) { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F16].pipeline; + } else { + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SOFT_MAX_F32].pipeline; + } } float scale; @@ -1392,6 +1498,14 @@ static enum ggml_status ggml_metal_graph_compute( (ne11 > ne11_mm_min || (ggml_is_quantized(src0t) && ne12 > 1))) { //printf("matrix: ne00 = %6d, ne01 = %6d, ne02 = %6d, ne11 = %6d, ne12 = %6d\n", ne00, ne01, ne02, ne11, ne12); + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; + } + id pipeline = nil; switch (src0->type) { @@ -1413,6 +1527,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ2_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_IQ4_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); @@ -1567,6 +1682,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_IQ1_M_F32].pipeline; + } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -1611,9 +1732,9 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBytes:&r2 length:sizeof(r2) atIndex:17]; [encoder setBytes:&r3 length:sizeof(r3) atIndex:18]; - if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || - src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - src0t == GGML_TYPE_Q2_K || src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ2_S) { + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { @@ -1654,47 +1775,37 @@ static enum ggml_status ggml_metal_graph_compute( } break; case GGML_OP_MUL_MAT_ID: { - //GGML_ASSERT(ne00 == ne10); - //GGML_ASSERT(ne03 == ne13); + const int n_as = src0->ne[2]; - GGML_ASSERT(src0t == GGML_TYPE_I32); + // src2 = ids + const int64_t ne20 = src2->ne[0]; + const int64_t ne21 = src2->ne[1]; + const int64_t ne22 = src2->ne[2]; GGML_UNUSED(ne22); + const int64_t ne23 = src2->ne[3]; GGML_UNUSED(ne23); - const int n_as = ((int32_t *) dst->op_params)[1]; + const uint64_t nb20 = src2->nb[0]; GGML_UNUSED(nb20); + const uint64_t nb21 = src2->nb[1]; + const uint64_t nb22 = src2->nb[2]; GGML_UNUSED(nb22); + const uint64_t nb23 = src2->nb[3]; GGML_UNUSED(nb23); - // TODO: make this more general - GGML_ASSERT(n_as <= 8); + const enum ggml_type src2t = src2->type; GGML_UNUSED(src2t); - // max size of the src1ids array in the kernel shared buffer - GGML_ASSERT(ne11 <= 4096); + GGML_ASSERT(src2t == GGML_TYPE_I32); - const int64_t ne20 = src2 ? src2->ne[0] : 0; - const int64_t ne21 = src2 ? src2->ne[1] : 0; - const int64_t ne22 = src2 ? src2->ne[2] : 0; - const int64_t ne23 = src2 ? src2->ne[3] : 0; GGML_UNUSED(ne23); - - const uint64_t nb20 = src2 ? src2->nb[0] : 0; GGML_UNUSED(nb20); - const uint64_t nb21 = src2 ? src2->nb[1] : 0; - const uint64_t nb22 = src2 ? src2->nb[2] : 0; - const uint64_t nb23 = src2 ? src2->nb[3] : 0; GGML_UNUSED(nb23); - - const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); - - GGML_ASSERT(!ggml_is_transposed(src2)); + GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(src1t == GGML_TYPE_F32); - const uint r2 = ne12/ne22; - const uint r3 = ne13/ne23; - // find the break-even point where the matrix-matrix kernel becomes more efficient compared // to the matrix-vector kernel - int ne11_mm_min = n_as; + // ne20 = n_used_experts + // ne21 = n_rows + const int dst_rows = ne20*ne21; + const int dst_rows_min = n_as; - const int idx = ((int32_t *) dst->op_params)[0]; - - // batch size - GGML_ASSERT(ne01 == ne11); + // max size of the rowids array in the kernel shared buffer + GGML_ASSERT(dst_rows <= 2048); // for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs // AMD GPU and older A-chips will reuse matrix-vector multiplication kernel @@ -1703,12 +1814,20 @@ static enum ggml_status ggml_metal_graph_compute( // indirect matrix multiplication // !!! if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && - ne20 % 32 == 0 && ne20 >= 64 && - ne11 > ne11_mm_min) { + ne00 % 32 == 0 && ne00 >= 64 && + dst_rows > dst_rows_min) { + + // some Metal matrix data types require aligned pointers + // ref: https://developer.apple.com/metal/Metal-Shading-Language-Specification.pdf (Table 2.5) + switch (src0->type) { + case GGML_TYPE_F32: GGML_ASSERT(nb01 % 16 == 0); break; + case GGML_TYPE_F16: GGML_ASSERT(nb01 % 8 == 0); break; + default: break; + } id pipeline = nil; - switch (src2->type) { + switch (src0->type) { case GGML_TYPE_F32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F32_F32 ].pipeline; break; case GGML_TYPE_F16: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_F16_F32 ].pipeline; break; case GGML_TYPE_Q4_0: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_Q4_0_F32 ].pipeline; break; @@ -1727,6 +1846,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ3_S_F32 ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ2_S_F32 ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_S_F32 ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ1_M_F32 ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_NL_F32 ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32 ].pipeline; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); @@ -1736,36 +1856,27 @@ static enum ggml_status ggml_metal_graph_compute( [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; - [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:5]; + [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:7]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:8]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:9]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:10]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13]; - [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:16]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:17]; - [encoder setBytes:&idx length:sizeof(idx) atIndex:18]; - // TODO: how to make this an array? read Metal docs - for (int j = 0; j < 8; ++j) { - // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 - struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:8]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:9]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:10]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:11]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:18]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; - size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur); + [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + dst_rows*4/*sizeof(ushort2)*/, 16) atIndex:0]; - [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:19 + j]; - } - - [encoder setThreadgroupMemoryLength:GGML_PAD(8192 + 2*ne11, 16) atIndex:0]; - - [encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 31)/32, (ne01 + 63)/64, n_as) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)]; } else { int nth0 = 32; int nth1 = 1; @@ -1775,7 +1886,7 @@ static enum ggml_status ggml_metal_graph_compute( id pipeline = nil; // use custom matrix x vector kernel - switch (src2t) { + switch (src0t) { case GGML_TYPE_F32: { GGML_ASSERT(src1t == GGML_TYPE_F32); @@ -1884,6 +1995,12 @@ static enum ggml_status ggml_metal_graph_compute( nth1 = 16; pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_S_F32].pipeline; } break; + case GGML_TYPE_IQ1_M: + { + nth0 = 4; + nth1 = 16; + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ1_M_F32].pipeline; + } break; case GGML_TYPE_IQ4_NL: { nth0 = 4; @@ -1894,7 +2011,12 @@ static enum ggml_status ggml_metal_graph_compute( { nth0 = 4; nth1 = 16; + #if QK_K == 64 + pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_NL_F32].pipeline; + #else pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_MV_ID_IQ4_XS_F32].pipeline; + #endif + } break; default: { @@ -1903,85 +2025,76 @@ static enum ggml_status ggml_metal_graph_compute( } }; - if (ggml_is_quantized(src2t)) { - GGML_ASSERT(ne20 >= nth0*nth1); + if (ggml_is_quantized(src0t)) { + GGML_ASSERT(ne00 >= nth0*nth1); } - const int64_t _ne1 = 1; // kernels needs a reference in constant memory - [encoder setComputePipelineState:pipeline]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:3]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:3]; [encoder setBytes:&ne20 length:sizeof(ne20) atIndex:4]; [encoder setBytes:&ne21 length:sizeof(ne21) atIndex:5]; - [encoder setBytes:&ne22 length:sizeof(ne22) atIndex:6]; - [encoder setBytes:&nb20 length:sizeof(nb20) atIndex:7]; - [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:8]; - [encoder setBytes:&nb22 length:sizeof(nb22) atIndex:9]; - [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:10]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:11]; - [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:12]; - [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:13]; - [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:14]; - [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:15]; - [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:16]; - [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:17]; - [encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:18]; - [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:19]; - [encoder setBytes:&r2 length:sizeof(r2) atIndex:20]; - [encoder setBytes:&r3 length:sizeof(r3) atIndex:21]; - [encoder setBytes:&idx length:sizeof(idx) atIndex:22]; - // TODO: how to make this an array? read Metal docs - for (int j = 0; j < 8; ++j) { - // NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8 - struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)]; + [encoder setBytes:&nb21 length:sizeof(nb21) atIndex:6]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:7]; + [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:8]; + [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:9]; + [encoder setBytes:&nb00 length:sizeof(nb00) atIndex:10]; + [encoder setBytes:&nb01 length:sizeof(nb01) atIndex:11]; + [encoder setBytes:&nb02 length:sizeof(nb02) atIndex:12]; + [encoder setBytes:&ne10 length:sizeof(ne10) atIndex:13]; + [encoder setBytes:&ne11 length:sizeof(ne11) atIndex:14]; + [encoder setBytes:&ne12 length:sizeof(ne12) atIndex:15]; + [encoder setBytes:&ne13 length:sizeof(ne13) atIndex:16]; + [encoder setBytes:&nb10 length:sizeof(nb10) atIndex:17]; + [encoder setBytes:&nb11 length:sizeof(nb11) atIndex:18]; + [encoder setBytes:&nb12 length:sizeof(nb12) atIndex:19]; + [encoder setBytes:&ne0 length:sizeof(ne0) atIndex:20]; + [encoder setBytes:&ne1 length:sizeof(ne1) atIndex:21]; + [encoder setBytes:&nb1 length:sizeof(nb1) atIndex:22]; - size_t offs_src_cur = 0; - id id_src_cur = ggml_metal_get_buffer(src_cur, &offs_src_cur); + const int64_t _ne1 = 1; + const int tgz = dst_rows; - [encoder setBuffer:id_src_cur offset:offs_src_cur atIndex:23 + j]; + if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || + src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || src0t == GGML_TYPE_Q2_K || + src0t == GGML_TYPE_IQ1_S || src0t == GGML_TYPE_IQ1_M || src0t == GGML_TYPE_IQ2_S) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - - if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || - src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - src2t == GGML_TYPE_Q2_K || src2t == GGML_TYPE_IQ1_S || src2t == GGML_TYPE_IQ2_S) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; - } - else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { - const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ3_XXS || src2t == GGML_TYPE_IQ3_S) { - const int mem_size = src2t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; + else if (src0t == GGML_TYPE_IQ3_XXS || src0t == GGML_TYPE_IQ3_S) { + const int mem_size = src0t == GGML_TYPE_IQ3_XXS ? 256*4+128 : 512*4; [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ4_NL || src2t == GGML_TYPE_IQ4_XS) { + else if (src0t == GGML_TYPE_IQ4_NL || src0t == GGML_TYPE_IQ4_XS) { const int mem_size = 32*sizeof(float); [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_Q4_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + else if (src0t == GGML_TYPE_Q4_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_Q3_K) { + else if (src0t == GGML_TYPE_Q3_K) { #ifdef GGML_QKK_64 - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #else - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; #endif } - else if (src2t == GGML_TYPE_Q5_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + else if (src0t == GGML_TYPE_Q5_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_Q6_K) { - [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 1)/2, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + else if (src0t == GGML_TYPE_Q6_K) { + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 1)/2, _ne1, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else { - const int64_t ny = (_ne1 + nrows - 1)/nrows; - [encoder dispatchThreadgroups:MTLSizeMake(ne21, ny, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; + const int64_t ny = (_ne1 + nrows - 1)/nrows; // = _ne1 + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ny, tgz) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } } } break; @@ -2008,6 +2121,7 @@ static enum ggml_status ggml_metal_graph_compute( case GGML_TYPE_IQ3_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ3_S ].pipeline; break; case GGML_TYPE_IQ2_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ2_S ].pipeline; break; case GGML_TYPE_IQ1_S: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_S ].pipeline; break; + case GGML_TYPE_IQ1_M: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ1_M ].pipeline; break; case GGML_TYPE_IQ4_NL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_NL ].pipeline; break; case GGML_TYPE_IQ4_XS: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_IQ4_XS ].pipeline; break; case GGML_TYPE_I32: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_GET_ROWS_I32 ].pipeline; break; @@ -2387,6 +2501,16 @@ static enum ggml_status ggml_metal_graph_compute( enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0]; + // bitonic sort requires the number of elements to be power of 2 + int64_t ne00_padded = 1; + while (ne00_padded < ne00) { + ne00_padded *= 2; + } + + // Metal kernels require the buffer size to be multiple of 16 bytes + // https://developer.apple.com/documentation/metal/mtlcomputecommandencoder/1443142-setthreadgroupmemorylength + const int mem_size = GGML_PAD(ne00_padded*sizeof(int32_t), 16); + id pipeline = nil; switch (order) { @@ -2396,11 +2520,13 @@ static enum ggml_status ggml_metal_graph_compute( }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; - [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; - [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2]; + [encoder setBytes:&ne00_padded length:sizeof( int64_t) atIndex:3]; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; - [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(1, nrows, 1) threadsPerThreadgroup:MTLSizeMake(ne00_padded, 1, 1)]; } break; case GGML_OP_LEAKY_RELU: { @@ -2420,6 +2546,161 @@ static enum ggml_status ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_FLASH_ATTN_EXT: + { + GGML_ASSERT(ne00 % 4 == 0); + GGML_ASSERT(src0->type == GGML_TYPE_F32); + + struct ggml_tensor * src3 = gf->nodes[i]->src[3]; + + GGML_ASSERT(ggml_are_same_shape(src1, src2)); + GGML_ASSERT(src3); + + size_t offs_src3 = 0; + + id id_src3 = src3 ? ggml_metal_get_buffer(src3, &offs_src3) : nil; + + GGML_ASSERT(!src3 || src3->type == GGML_TYPE_F16); + GGML_ASSERT(!src3 || src3->ne[1] >= GGML_PAD(src0->ne[1], 8) && + "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); + + const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); + const int64_t ne31 = src3 ? src3->ne[1] : 0; + const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); + const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); + + const uint64_t nb30 = src3 ? src3->nb[0] : 0; GGML_UNUSED(nb30); + const uint64_t nb31 = src3 ? src3->nb[1] : 0; + const uint64_t nb32 = src3 ? src3->nb[2] : 0; GGML_UNUSED(nb32); + const uint64_t nb33 = src3 ? src3->nb[3] : 0; GGML_UNUSED(nb33); + + const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); + + float scale; + memcpy(&scale, dst->op_params, sizeof(float)); + + id pipeline = nil; + + bool use_vec_kernel = false; + + if (ne01 >= 4 || (ne00%128 != 0)) { + switch (ne00) { + case 64: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H64 ].pipeline; break; + case 80: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H80 ].pipeline; break; + case 96: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H96 ].pipeline; break; + case 112: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H112].pipeline; break; + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_F16_H256].pipeline; break; + default: + { + GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_ASSERT(false && "add template specialization for this size"); + } + } + } else { + use_vec_kernel = true; + + switch (ne00) { + case 128: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H128].pipeline; break; + case 256: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_FLASH_ATTN_EXT_VEC_F16_H256].pipeline; break; + default: + { + GGML_METAL_LOG_ERROR("unsupported size: %lld\n", ne00); + GGML_METAL_LOG_ERROR("add template specialization for this size\n"); + GGML_ASSERT(false && "add template specialization for this size"); + } + } + } + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; + [encoder setBuffer:id_src3 offset:offs_src3 atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:4]; + [encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5]; + [encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6]; + [encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7]; + [encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8]; + [encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9]; + [encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10]; + [encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11]; + [encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12]; + [encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13]; + [encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14]; + [encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15]; + [encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16]; + [encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17]; + [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18]; + [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19]; + [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20]; + [encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21]; + [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22]; + [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23]; + [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24]; + [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25]; + [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; + [encoder setBytes:&scale length:sizeof( float) atIndex:27]; + + if (!use_vec_kernel) { + // half8x8 kernel + const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 8 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + int64_t nsgmax = 2; + + while (true) { + const size_t smem = nqptg*(ne00 + 2*nsgmax*(ncpsg + nqptg))*(sizeof(float)/2); + if (smem > ctx->device.maxThreadgroupMemoryLength) { + break; + } + nsgmax *= 2; + } + nsgmax /= 2; + + // simdgroups per threadgroup (a.k.a. warps) + const int64_t nsg = ne01 <= nqptg ? MAX(4, MIN(nsgmax, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32))) : 4; + + const size_t smem = nqptg*(ne00 + 2*nsg*(ncpsg + nqptg))*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); + + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } else { + // half1x4 kernel + const int64_t nqptg = 1; // queries per threadgroup !! sync with kernel template arguments !! + const int64_t ncpsg = 32; // cache values per simdgroup !! sync with kernel template arguments !! + + GGML_ASSERT(nqptg <= 32); + GGML_ASSERT(nqptg % 1 == 0); + GGML_ASSERT(ncpsg % 32 == 0); + + // simdgroups per threadgroup (a.k.a. warps) + const int64_t nsgt = MAX(2, MIN(ne11/ncpsg, (int64_t) pipeline.maxTotalThreadsPerThreadgroup/32)); + + int64_t nsg = 1; + while (nsg <= nsgt) { + nsg *= 2; + } + nsg /= 2; + + const size_t smem = (nqptg*(ne00 + 2*nsg*(ncpsg + nqptg)) + nsg*ne00)*(sizeof(float)/2); + + //printf("smem: %zu, max: %zu\n", smem, ctx->device.maxThreadgroupMemoryLength); + GGML_ASSERT(smem <= ctx->device.maxThreadgroupMemoryLength); + [encoder setThreadgroupMemoryLength:GGML_PAD(smem, 16) atIndex:0]; + + [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + } + } break; case GGML_OP_DUP: case GGML_OP_CPY: case GGML_OP_CONT: @@ -2507,6 +2788,11 @@ static enum ggml_status ggml_metal_graph_compute( MTLCommandBufferStatus status = [command_buffer status]; if (status != MTLCommandBufferStatusCompleted) { GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); + if (status == MTLCommandBufferStatusError) { + NSString * error_code = [command_buffer error].localizedDescription; + GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]); + } + return GGML_STATUS_FAILED; } } @@ -2563,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_ ggml_backend_metal_free_device(); if (ctx->owned) { +#if TARGET_OS_OSX + vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size); +#else free(ctx->all_data); +#endif } free(ctx); @@ -2623,10 +2913,13 @@ GGML_CALL static const char * ggml_backend_metal_buffer_type_get_name(ggml_backe UNUSED(buft); } -static void ggml_backend_metal_log_allocated_size(id device) { +static void ggml_backend_metal_log_allocated_size(id device, size_t size_aligned) { +#ifndef GGML_METAL_NDEBUG #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) if (@available(macOS 10.12, iOS 16.0, *)) { - GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)", + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)", + __func__, + size_aligned / 1024.0 / 1024.0, device.currentAllocatedSize / 1024.0 / 1024.0, device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0); @@ -2636,10 +2929,15 @@ static void ggml_backend_metal_log_allocated_size(id device) { GGML_METAL_LOG_INFO("\n"); } } else { - GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0); + GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n", + __func__, + size_aligned / 1024.0 / 1024.0, + device.currentAllocatedSize / 1024.0 / 1024.0); } +#endif #endif UNUSED(device); + UNUSED(size_aligned); } GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { @@ -2659,22 +2957,23 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff ctx->owned = true; ctx->n_buffers = 1; - ctx->buffers[0].data = ctx->all_data; - ctx->buffers[0].size = size; - ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data - length:size_aligned - options:MTLResourceStorageModeShared - deallocator:nil]; + if (ctx->all_data != NULL) { + ctx->buffers[0].data = ctx->all_data; + ctx->buffers[0].size = size; + ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data + length:size_aligned + options:MTLResourceStorageModeShared + deallocator:nil]; + } - if (ctx->buffers[0].metal == nil) { + if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) { GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0); free(ctx); ggml_backend_metal_free_device(); return NULL; } - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); - ggml_backend_metal_log_allocated_size(device); + //ggml_backend_metal_log_allocated_size(device, size_aligned); return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size); } @@ -2761,7 +3060,7 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, return false; } - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0); + ggml_backend_metal_log_allocated_size(device, size_aligned); ++ctx->n_buffers; } else { @@ -2784,7 +3083,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, return false; } - GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i); + ggml_backend_metal_log_allocated_size(device, size_step_aligned); + if (i + size_step < size) { GGML_METAL_LOG_INFO("\n"); } @@ -2793,8 +3093,6 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, } } - ggml_backend_metal_log_allocated_size(device); - return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size); } diff --git a/ggml-metal.metal b/ggml-metal.metal index 5552c8b28..46c7d5039 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -13,8 +13,8 @@ using namespace metal; #define N_SIMDWIDTH 32 // assuming SIMD group size is 32 enum ggml_sort_order { - GGML_SORT_ASC, - GGML_SORT_DESC, + GGML_SORT_ORDER_ASC, + GGML_SORT_ORDER_DESC, }; // general-purpose kernel for addition, multiplication and division of two tensors @@ -213,6 +213,15 @@ kernel void kernel_scale_4( dst[tpig] = src0[tpig] * scale; } +kernel void kernel_clamp( + device const float * src0, + device float * dst, + constant float & min, + constant float & max, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] < min ? min : (src0[tpig] > max ? max : src0[tpig]); +} + kernel void kernel_relu( device const float * src0, device float * dst, @@ -233,6 +242,15 @@ constant float GELU_QUICK_COEF = -1.702f; constant float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; kernel void kernel_gelu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = 0.5f*x*(1.0f + precise::tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); +} + +kernel void kernel_gelu_4( device const float4 * src0, device float4 * dst, uint tpig[[thread_position_in_grid]]) { @@ -246,6 +264,15 @@ kernel void kernel_gelu( } kernel void kernel_gelu_quick( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + + dst[tpig] = x*(1.0f/(1.0f+exp(GELU_QUICK_COEF*x))); +} + +kernel void kernel_gelu_quick_4( device const float4 * src0, device float4 * dst, uint tpig[[thread_position_in_grid]]) { @@ -255,6 +282,14 @@ kernel void kernel_gelu_quick( } kernel void kernel_silu( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + device const float & x = src0[tpig]; + dst[tpig] = x / (1.0f + exp(-x)); +} + +kernel void kernel_silu_4( device const float4 * src0, device float4 * dst, uint tpig[[thread_position_in_grid]]) { @@ -317,11 +352,12 @@ kernel void kernel_sum_rows( dst_row[0] = row_sum; } +template kernel void kernel_soft_max( - device const float * src0, - device const float * src1, - device const float * src2, - device float * dst, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -340,10 +376,10 @@ kernel void kernel_soft_max( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; - device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr; - device const float * ppos = src2 != src0 ? src2 : nullptr; - device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00; + device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr; + device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr; + device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); float slope = 0.0f; @@ -421,11 +457,12 @@ kernel void kernel_soft_max( } } +template kernel void kernel_soft_max_4( - device const float * src0, - device const float * src1, - device const float * src2, - device float * dst, + device const char * src0, + device const char * src1, + device const char * src2, + device char * dst, constant int64_t & ne00, constant int64_t & ne01, constant int64_t & ne02, @@ -444,10 +481,10 @@ kernel void kernel_soft_max_4( const int64_t i02 = (tgpig - i03*ne02*ne01) / ne01; const int64_t i01 = (tgpig - i03*ne02*ne01 - i02*ne01); - device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); - device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr; - device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr; - device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); + device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; + device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; + device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr; + device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; float slope = 0.0f; @@ -464,7 +501,7 @@ kernel void kernel_soft_max_4( float4 lmax4 = -INFINITY; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); + lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))); } const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); @@ -490,7 +527,7 @@ kernel void kernel_soft_max_4( // parallel sum float4 lsum4 = 0.0f; for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { - const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); + const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val); lsum4 += exp_psrc4; pdst4[i00] = exp_psrc4; } @@ -527,6 +564,14 @@ kernel void kernel_soft_max_4( } } +typedef decltype(kernel_soft_max) kernel_soft_max_t; +typedef decltype(kernel_soft_max_4) kernel_soft_max_4_t; + +template [[host_name("kernel_soft_max_f16")]] kernel kernel_soft_max_t kernel_soft_max; +template [[host_name("kernel_soft_max_f32")]] kernel kernel_soft_max_t kernel_soft_max; +template [[host_name("kernel_soft_max_f16_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; +template [[host_name("kernel_soft_max_f32_4")]] kernel kernel_soft_max_4_t kernel_soft_max_4; + kernel void kernel_diag_mask_inf( device const float * src0, device float * dst, @@ -866,6 +911,7 @@ void mul_vec_q_n_f32_impl( int64_t ne1, uint r2, uint r3, + threadgroup int8_t * shared_values, uint3 tgpig, uint tiisg, uint sgitg) { const int nb = ne00/QK4_0; @@ -942,7 +988,7 @@ kernel void kernel_mul_mv_q4_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q4_1_f32( @@ -968,7 +1014,7 @@ kernel void kernel_mul_mv_q4_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_0_f32( @@ -994,7 +1040,7 @@ kernel void kernel_mul_mv_q5_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } kernel void kernel_mul_mv_q5_1_f32( @@ -1020,7 +1066,7 @@ kernel void kernel_mul_mv_q5_1_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + mul_vec_q_n_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } @@ -1030,18 +1076,19 @@ void kernel_mul_mv_q8_0_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nr = N_DST; const int nsg = N_SIMDGROUP; const int nw = N_SIMDWIDTH; @@ -1119,7 +1166,7 @@ kernel void kernel_mul_mv_q8_0_f32( uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,tgpig,tiisg,sgitg); + kernel_mul_mv_q8_0_f32_impl(src0,src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,nullptr,tgpig,tiisg,sgitg); } #define N_F32_F32 4 @@ -1128,24 +1175,24 @@ void kernel_mul_mv_f32_f32_impl( device const char * src0, device const char * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + uint64_t nb00, + uint64_t nb01, + uint64_t nb02, + int64_t ne10, + int64_t ne11, + int64_t ne12, + uint64_t nb10, + uint64_t nb11, + uint64_t nb12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, + uint tiisg) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F32_F32; @@ -1398,24 +1445,24 @@ void kernel_mul_mv_f16_f32_impl( device const char * src0, device const char * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + uint64_t nb00, + uint64_t nb01, + uint64_t nb02, + int64_t ne10, + int64_t ne11, + int64_t ne12, + uint64_t nb10, + uint64_t nb11, + uint64_t nb12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, + uint tiisg) { const int64_t r0 = tgpig.x; const int64_t rb = tgpig.y*N_F16_F32; @@ -1973,9 +2020,11 @@ kernel void kernel_timestep_embedding_f32( // bitonic sort implementation following the CUDA kernels as reference typedef void (argsort_t)( - device const float * x, - device int32_t * dst, - constant int64_t & ncols, + device const float * x, + device int32_t * dst, + constant int64_t & ncols, + constant int64_t & ncols_pad, + threadgroup int32_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]]); @@ -1984,33 +2033,42 @@ kernel void kernel_argsort_f32_i32( device const float * x, device int32_t * dst, constant int64_t & ncols, + constant int64_t & ncols_pad, + threadgroup int32_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint3 tpitg[[thread_position_in_threadgroup]]) { // bitonic sort int col = tpitg[0]; int row = tgpig[1]; - if (col >= ncols) return; + if (col >= ncols_pad) return; - device const float * x_row = x + row * ncols; - device int32_t * dst_row = dst + row * ncols; + device const float * x_row = x + row * ncols; + threadgroup int32_t * dst_row = shared_values; // initialize indices - if (col < ncols) { - dst_row[col] = col; - } + dst_row[col] = col; + threadgroup_barrier(mem_flags::mem_threadgroup); - for (int k = 2; k <= ncols; k *= 2) { + for (int k = 2; k <= ncols_pad; k *= 2) { for (int j = k / 2; j > 0; j /= 2) { int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { SWAP(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { SWAP(dst_row[col], dst_row[ixj]); } } @@ -2018,10 +2076,15 @@ kernel void kernel_argsort_f32_i32( threadgroup_barrier(mem_flags::mem_threadgroup); } } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } } -template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; -template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_asc")]] kernel argsort_t kernel_argsort_f32_i32; +template [[host_name("kernel_argsort_f32_i32_desc")]] kernel argsort_t kernel_argsort_f32_i32; kernel void kernel_leaky_relu_f32( device const float * src0, @@ -2031,6 +2094,632 @@ kernel void kernel_leaky_relu_f32( dst[tpig] = src0[tpig] > 0.0f ? src0[tpig] : src0[tpig] * slope; } +typedef void (flash_attn_ext_f16_t)( + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne31, + constant uint64_t & nb31, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant float & scale, + threadgroup half * shared, + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]); + +// ref: https://arxiv.org/pdf/2307.08691.pdf +template // head size, queries per threadgroup, cache items per threadgroup +kernel void kernel_flash_attn_ext_f16( + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne31, + constant uint64_t & nb31, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant float & scale, + threadgroup half * shared [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + const short nsg = ntg.y; // number of simdgroups + + const short iq3 = tgpig[2]; + const short iq2 = tgpig[1]; + const short iq1 = tgpig[0]*Q; + + const short D4 = D/4; + const short D8 = D/8; + //const short Q8 = Q/8; + const short NW = N_SIMDWIDTH; + const short SH = (C + Q); // shared memory per simdgroup in (half) + + const short T = D + 2*nsg*SH; // shared memory size per query in (half) + const short TF = T/2; // shared memory size per query in (float) + const short T4 = T/4; // shared memory size per query in (half4) + + threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data + threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 + threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix + + // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) + simdgroup_half8x8 lo[D8]; + + // load heads from Q to shared memory + for (short j = sgitg; j < Q; j += nsg) { + device const float4 * q4 = (device const float4 *) ((device const char *) q + ((iq1 + j)*nb01 + iq2*nb02 + iq3*nb03)); + + for (short i = tiisg; i < D4; i += NW) { + if (iq1 + j < ne01) { + sq4[j*T4 + i] = (half4) q4[i]; + } else { + sq4[j*T4 + i] = 0.0h; + } + } + } + + // zero out lo + for (short i = 0; i < D8; ++i) { + lo[i] = make_filled_simdgroup_matrix(0.0h); + } + + // zero out shared memory SH + for (short j = 0; j < Q; ++j) { + for (short i = tiisg; i < SH; i += NW) { + ss[j*TF + i] = 0.0f; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + { + float S[Q] = { [0 ... Q-1] = 0.0h }; + float M[Q] = { [0 ... Q-1] = -FLT_MAX/2 }; + + // assume K and V are same shape + const short ne22 = ne12; + const short ne23 = ne13; + + const uint nb21 = nb11; + const uint nb22 = nb12; + const uint nb23 = nb13; + + // broadcast + const short rk2 = ne02/ne12; + const short rk3 = ne03/ne13; + + const short rv2 = ne02/ne22; + const short rv3 = ne03/ne23; + + // k indices + const short ik2 = iq2/rk2; + const short ik3 = iq3/rk3; + + // v indices + const short iv2 = iq2/rv2; + const short iv3 = iq3/rv3; + + // load the queries from shared memory into local memory + simdgroup_half8x8 mq[D8]; + + for (short i = 0; i < D8; ++i) { + simdgroup_load(mq[i], sq + i*8, T); + } + + // pointer to the mask + device const half * mp = (device const half *) (mask + iq1*nb31); + + // prepare diagonal scale matrix + simdgroup_float8x8 mscale(scale); + + // loop over the KV cache + // each simdgroup handles blocks of Q rows and C columns + for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) { + const int ic = ic0 + C*sgitg; + if (ic >= ne11) { + break; + } + + // Q*K^T + { + for (short cc = 0; cc < C/8; ++cc) { + simdgroup_float8x8 mqk = make_filled_simdgroup_matrix(0.h); + + device const half * pk = (device const half *) ((device const char *) k + ((ic + 8*cc)*nb11 + ik2*nb12 + ik3*nb13)); + + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mk; + simdgroup_load(mk, pk + i*8, nb11/sizeof(half), 0, true); // transpose + + simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); + } + + // mqk = mqk*scale + mask + simdgroup_half8x8 mm; + simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false); + simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); + + simdgroup_store(mqk, ss + 8*cc, TF, 0, false); + } + } + + // used to detect blocks full of -INF + float smax = -INFINITY; + + // online softmax + { + float ms[Q]; + + for (short j = 0; j < Q; ++j) { + const short p = tiisg; + + const float m = M[j]; + const float s = ss[j*TF + p]; + + smax = simd_max(max(smax, s)); + M[j] = simd_max(max(M[j], s)); + + ms[j] = exp(m - M[j]); + const float vs = exp(s - M[j]); + + S[j] = S[j]*ms[j] + simd_sum(vs); + + // the P matrix from the paper (Q rows, C columns) + ss[j*TF + p] = vs; + } + + // create a QxQ diagonal matrix for rescaling the output + if (tiisg < Q) { + ss[tiisg*TF + C + tiisg] = ms[tiisg]; + } + } + + // skip -INF blocks + if (smax == -INFINITY) { + continue; + } + + // O = diag(ms)*O + { + simdgroup_float8x8 mm; + simdgroup_load(mm, ss + C, TF, 0, false); + + for (short i = 0; i < D8; ++i) { + simdgroup_multiply(lo[i], mm, lo[i]); + } + } + + // O = O + (Q*K^T)*V + { + for (short cc = 0; cc < C/8; ++cc) { + device const half * pv = (device const half *) ((device const char *) v + ((ic + 8*cc)*nb21 + iv2*nb22 + iv3*nb23)); + + for (short i = 0; i < D8; ++i) { + simdgroup_half8x8 mk; + simdgroup_load(mk, pv + i*8, nb21/sizeof(half), 0, false); + + simdgroup_float8x8 mv; + simdgroup_load(mv, ss + 8*cc, TF, 0, false); + + simdgroup_multiply_accumulate(lo[i], mv, mk, lo[i]); + } + } + } + } + + // these are needed for reducing the results from the simdgroups (reuse the ss buffer) + for (short j = 0; j < Q; ++j) { + if (tiisg == 0) { + ss[j*TF + 0] = S[j]; + ss[j*TF + 1] = M[j]; + } + } + } + + // reduce the warps sequentially + for (short sg = 1; sg < nsg; ++sg) { + float S = { 0.0h }; + float M = { -FLT_MAX/2 }; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // each simdgroup stores its output to shared memory, reusing sq + if (sgitg == sg) { + for (short i = 0; i < D8; ++i) { + simdgroup_store(lo[i], sq + i*8, T, 0, false); + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // the first simdgroup accumulates the results from the other simdgroups + if (sgitg == 0) { + for (short j = 0; j < Q; ++j) { + const float S0 = ss[j*TF + 0]; + const float S1 = ss[j*TF + sg*SH + 0]; + + const float M0 = ss[j*TF + 1]; + const float M1 = ss[j*TF + sg*SH + 1]; + + M = max(M0, M1); + + const float ms0 = exp(M0 - M); + const float ms1 = exp(M1 - M); + + S = S0*ms0 + S1*ms1; + + if (tiisg == 0) { + ss[j*TF + 0] = S; + ss[j*TF + 1] = M; + + ss[j*TF + C + j ] = ms0; + ss[j*TF + C + j + sg*SH] = ms1; + } + } + + // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 + { + simdgroup_half8x8 t; + simdgroup_float8x8 ms0; + simdgroup_float8x8 ms1; + + simdgroup_load(ms0, ss + C, TF, 0, false); + simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false); + + for (short i = 0; i < D8; ++i) { + simdgroup_load (t, sq + i*8, T, 0, false); + simdgroup_multiply(t, ms1, t); + + simdgroup_multiply_accumulate(lo[i], ms0, lo[i], t); + } + } + } + } + + // store result to shared memory (reuse sq) + if (sgitg == 0) { + for (short i = 0; i < D8; ++i) { + simdgroup_store(lo[i], sq + i*8, T, 0, false); + } + } + + device float4 * dst4 = (device float4 *) dst; + + // final rescale with 1/S and store to global memory + if (sgitg == 0) { + for (short j = 0; j < Q && iq1 + j < ne01; ++j) { + const float S = ss[j*TF + 0]; + + for (short i = tiisg; i < D4; i += NW) { + dst4[(iq3*ne2*ne1 + iq2 + (iq1 + j)*ne1)*D4 + i] = (float4) sq4[j*T4 + i]/S; + } + } + } +} + +template [[host_name("kernel_flash_attn_ext_f16_h64" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<64>; +template [[host_name("kernel_flash_attn_ext_f16_h80" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<80>; +template [[host_name("kernel_flash_attn_ext_f16_h96" )]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<96>; +template [[host_name("kernel_flash_attn_ext_f16_h112")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<112>; +template [[host_name("kernel_flash_attn_ext_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<128>; +template [[host_name("kernel_flash_attn_ext_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_f16<256>; + +template // head size, queries per threadgroup, cache items per threadgroup +kernel void kernel_flash_attn_ext_vec_f16( + device const char * q, + device const char * k, + device const char * v, + device const char * mask, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne03, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant uint64_t & nb03, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant uint64_t & nb13, + constant int64_t & ne31, + constant uint64_t & nb31, + constant int64_t & ne0, + constant int64_t & ne1, + constant int64_t & ne2, + constant int64_t & ne3, + constant float & scale, + threadgroup half * shared [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint3 tpitg[[thread_position_in_threadgroup]], + uint3 ntg[[threads_per_threadgroup]], + ushort tiisg[[thread_index_in_simdgroup]], + ushort sgitg[[simdgroup_index_in_threadgroup]]) { + const short nsg = ntg.y; // number of simdgroups + + const short iq3 = tgpig[2]; + const short iq2 = tgpig[1]; + const short iq1 = tgpig[0]; + + const short D4 = D/4; + const short NW = N_SIMDWIDTH; + const short SH = (C + Q); // shared memory per simdgroup in (half) + + const short T = D + 2*nsg*SH; // shared memory size per query in (half) + + //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data + threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 + threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix + threadgroup float4 * ss4 = (threadgroup float4 *) (shared + 2*sgitg*SH + 1*D); // same as above but in half4 + threadgroup half4 * sr4 = (threadgroup half4 *) (shared + sgitg*D + 1*T); // scratch buffer for the results + + // store the result for all queries in local memory in 8x8 matrices (the O matrix from the paper) + half4 lo[D4/NW]; + + // load heads from Q to shared memory + device const float4 * q4 = (device const float4 *) ((device const char *) q + (iq1*nb01 + iq2*nb02 + iq3*nb03)); + + for (short i = tiisg; i < D4; i += NW) { + if (iq1 < ne01) { + sq4[i] = (half4) q4[i]; + } else { + sq4[i] = 0.0h; + } + } + + // zero out lo + for (short i = tiisg; i < D4; i += NW) { + lo[i/NW] = 0.0h; + } + + // zero out shared memory SH + for (short i = tiisg; i < SH/4; i += NW) { + ss4[i] = 0.0h; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + { + float S = { 0.0h }; + float M = { -FLT_MAX/2 }; + + // assume K and V are same shape + const short ne22 = ne12; + const short ne23 = ne13; + + const uint nb21 = nb11; + const uint nb22 = nb12; + const uint nb23 = nb13; + + // broadcast + const short rk2 = ne02/ne12; + const short rk3 = ne03/ne13; + + const short rv2 = ne02/ne22; + const short rv3 = ne03/ne23; + + // k indices + const short ik2 = iq2 / rk2; + const short ik3 = iq3 / rk3; + + // v indices + const short iv2 = iq2 / rv2; + const short iv3 = iq3 / rv3; + + // load the queries from shared memory into local memory + half4 mq[D4]; + + for (short ii = 0; ii < D4; ii += NW) { + short i = ii + tiisg; + mq[i] = sq4[i]; + } + + // pointer to the mask + device const half4 * mp4 = (device const half4 *) (mask + iq1*nb31); + + // loop over the KV cache + // each simdgroup handles blocks of Q rows and C columns + for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) { + const int ic = ic0 + C*sgitg; + if (ic >= ne11) { + break; + } + + // Q*K^T + { +#pragma unroll + for (short cc = 0; cc < C/4; ++cc) { + float4 mqk = { 0.0h }; + + device const half4 * pk4 = (device const half4 *) ((device const char *) k + ((ic + 4*cc)*nb11 + ik2*nb12 + ik3*nb13)); + +#pragma unroll + for (short ii = 0; ii < D4; ii += NW) { + const short i = ii + tiisg; + + half4x4 mk; + mk[0] = pk4[i + 0*(nb11/8)]; + mk[1] = pk4[i + 1*(nb11/8)]; + mk[2] = pk4[i + 2*(nb11/8)]; + mk[3] = pk4[i + 3*(nb11/8)]; + + mqk += (float4) (mq[i] * mk); + } + + // reduce the results from the threads in the simdgroup + mqk += simd_shuffle_down(mqk, 16); + mqk += simd_shuffle_down(mqk, 8); + mqk += simd_shuffle_down(mqk, 4); + mqk += simd_shuffle_down(mqk, 2); + mqk += simd_shuffle_down(mqk, 1); + + // mqk = mqk*scale + mask + if (tiisg == 0) { + float4 mm = (float4) mp4[ic/4 + cc]; + mqk = mqk*scale + mm; + + ss4[cc] = mqk; + } + } + } + + // online softmax + { + const short p = tiisg; + + const float m = M; + const float s = ss[p]; + + M = simd_max(max(M, s)); + + const float ms = exp(m - M); + const float vs = exp(s - M); + + S = S*ms + simd_sum(vs); + + // the P matrix from the paper (Q rows, C columns) + ss[p] = vs; + + // O = diag(ms)*O +#pragma unroll + for (short ii = 0; ii < D4; ii += NW) { + const short i = ii + tiisg; + lo[i/NW] *= ms; + } + } + + // O = O + (Q*K^T)*V + { +#pragma unroll + for (short cc = 0; cc < C/4; ++cc) { + device const half4 * pv4 = (device const half4 *) ((device const char *) v + ((ic + 4*cc)*nb21 + iv2*nb22 + iv3*nb23)); + +#pragma unroll + for (short ii = 0; ii < D4; ii += NW) { + const short i = ii + tiisg; + + lo[i/NW] += pv4[i + 0*(nb21/8)] * ss[4*cc + 0]; + lo[i/NW] += pv4[i + 1*(nb21/8)] * ss[4*cc + 1]; + lo[i/NW] += pv4[i + 2*(nb21/8)] * ss[4*cc + 2]; + lo[i/NW] += pv4[i + 3*(nb21/8)] * ss[4*cc + 3]; + } + } + } + + } + + // these are needed for reducing the results from the simdgroups (reuse the ss buffer) + if (tiisg == 0) { + ss[0] = S; + ss[1] = M; + } + } + + // store results to shared memory + for (short ii = 0; ii < D4; ii += NW) { + short i = ii + tiisg; + sr4[i] = lo[ii/NW]; + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + // parallel reduce + for (short r = nsg/2; r > 0; r >>= 1) { + if (sgitg < r) { + const float S0 = ss[ 0]; + const float S1 = ss[r*SH + 0]; + + const float M0 = ss[ 1]; + const float M1 = ss[r*SH + 1]; + + const float M = max(M0, M1); + + const float ms0 = exp(M0 - M); + const float ms1 = exp(M1 - M); + + const float S = S0*ms0 + S1*ms1; + + if (tiisg == 0) { + ss[0] = S; + ss[1] = M; + } + + // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1 + for (short ii = 0; ii < D4; ii += NW) { + short i = ii + tiisg; + sr4[i] = sr4[i]*ms0 + sr4[i + r*D4]*ms1; + } + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + } + + device float4 * dst4 = (device float4 *) dst; + + // final rescale with 1/S and store to global memory + if (sgitg == 0) { + const float S = ss[0]; + + for (short ii = 0; ii < D4; ii += NW) { + short i = ii + tiisg; + dst4[(iq3*ne2*ne1 + iq2 + (iq1)*ne1)*D4 + i] = (float4) sr4[i]/S; + } + } +} + +template [[host_name("kernel_flash_attn_ext_vec_f16_h128")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<128>; +template [[host_name("kernel_flash_attn_ext_vec_f16_h256")]] kernel flash_attn_ext_f16_t kernel_flash_attn_ext_vec_f16<256>; + kernel void kernel_cpy_f16_f16( device const half * src0, device half * dst, @@ -2684,18 +3373,19 @@ void kernel_mul_mv_q2_K_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -2855,7 +3545,7 @@ kernel void kernel_mul_mv_q2_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_q2_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } #if QK_K == 256 @@ -2863,18 +3553,19 @@ void kernel_mul_mv_q3_K_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; @@ -3030,6 +3721,7 @@ void kernel_mul_mv_q3_K_f32_impl( constant int64_t & ne1, constant uint & r2, constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3119,7 +3811,7 @@ kernel void kernel_mul_mv_q3_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_q3_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } #if QK_K == 256 @@ -3127,18 +3819,19 @@ void kernel_mul_mv_q4_K_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const uint16_t kmask1 = 0x3f3f; const uint16_t kmask2 = 0x0f0f; @@ -3249,6 +3942,7 @@ void kernel_mul_mv_q4_K_f32_impl( constant int64_t & ne1, constant uint & r2, constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -3357,25 +4051,26 @@ kernel void kernel_mul_mv_q4_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_q4_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q5_K_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; @@ -3563,25 +4258,26 @@ kernel void kernel_mul_mv_q5_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_q5_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } void kernel_mul_mv_q6_K_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const uint8_t kmask1 = 0x03; const uint8_t kmask2 = 0x0C; @@ -3697,7 +4393,7 @@ kernel void kernel_mul_mv_q6_K_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_q6_K_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } // ======================= "True" 2-bit @@ -3706,19 +4402,19 @@ void kernel_mul_mv_iq2_xxs_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -3835,19 +4531,19 @@ void kernel_mul_mv_iq2_xs_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -3974,19 +4670,19 @@ void kernel_mul_mv_iq3_xxs_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -4106,19 +4802,19 @@ void kernel_mul_mv_iq3_s_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -4238,19 +4934,19 @@ void kernel_mul_mv_iq2_s_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -4371,18 +5067,19 @@ void kernel_mul_mv_iq1_s_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_value, + uint3 tgpig, + uint tiisg, + uint sgitg) { const int nb = ne00/QK_K; const int r0 = tgpig.x; @@ -4456,24 +5153,134 @@ void kernel_mul_mv_iq1_s_f32_impl( } } +void kernel_mul_mv_iq1_m_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_value, + uint3 tgpig, + uint tiisg, + uint sgitg) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + device const block_iq1_m * x = (device const block_iq1_m *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + +#if QK_K != 64 + iq1m_scale_t scale; +#endif + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + float4 sumy = {0.f}; + for (int i = 0; i < 8; ++i) { + yl[i+ 0] = y4[i+ 0]; sumy[0] += yl[i+ 0]; + yl[i+ 8] = y4[i+ 8]; sumy[1] += yl[i+ 8]; + yl[i+16] = y4[i+16]; sumy[2] += yl[i+16]; + yl[i+24] = y4[i+24]; sumy[3] += yl[i+24]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq1_m * xr = x + ibl; + device const uint8_t * qs = xr->qs + 4 * ib; + device const uint8_t * qh = xr->qh + 2 * ib; + device const uint16_t * sc = (device const uint16_t *)xr->scales; + + for (int row = 0; row < N_DST; row++) { + +#if QK_K != 64 + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); +#endif + + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + constant uint8_t * grid3 = (constant uint8_t *)(iq1s_grid_gpu + (qs[2] | ((qh[1] << 8) & 0x700))); + constant uint8_t * grid4 = (constant uint8_t *)(iq1s_grid_gpu + (qs[3] | ((qh[1] << 4) & 0x700))); + + float2 sum = {0.f}; + for (int j = 0; j < 4; ++j) { + sum[0] += yl[j+ 0] * (grid1[j] & 0xf) + yl[j+ 4] * (grid1[j] >> 4) + + yl[j+ 8] * (grid2[j] & 0xf) + yl[j+12] * (grid2[j] >> 4); + sum[1] += yl[j+16] * (grid3[j] & 0xf) + yl[j+20] * (grid3[j] >> 4) + + yl[j+24] * (grid4[j] & 0xf) + yl[j+28] * (grid4[j] >> 4); + } + const float delta1 = sumy[0] * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[1] * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float delta2 = sumy[2] * (qh[1] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA) + sumy[3] * (qh[1] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); +#if QK_K == 64 + const float d = (float) *((device const half *)(sc - 1)); + sumf[row] += d * ((sum[0] + delta1) * (2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1) + + (sum[1] + delta2) * (2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1)); +#else + sumf[row] += (float)scale.f16 * ((sum[0] + delta1) * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 7) + 1) + + (sum[1] + delta2) * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 7) + 1)); +#endif + + sc += nb*sizeof(block_iq1_m)/2; + qs += nb*sizeof(block_iq1_m); + qh += nb*sizeof(block_iq1_m); + } + + y4 += 32 * 32; + } + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; + } + } +} + void kernel_mul_mv_iq4_nl_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup float * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values_i8, + uint3 tgpig, + uint tiisg, + uint sgitg) { + threadgroup float * shared_values = (threadgroup float *)shared_values_i8; const int nb = ne00/QK4_NL; const int r0 = tgpig.x; const int r1 = tgpig.y; @@ -4554,20 +5361,21 @@ void kernel_mul_mv_iq4_xs_f32_impl( device const void * src0, device const float * src1, device float * dst, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant int64_t & ne10, - constant int64_t & ne12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup float * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values_i8, + uint3 tgpig, + uint tiisg, + uint sgitg) { + threadgroup float * shared_values = (threadgroup float *)shared_values_i8; const int nb = ne00/QK_K; const int r0 = tgpig.x; const int r1 = tgpig.y; @@ -4670,7 +5478,35 @@ kernel void kernel_mul_mv_iq1_s_f32( uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, tgpig, tiisg, sgitg); + kernel_mul_mv_iq1_s_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); +} + +[[host_name("kernel_mul_mv_iq1_m_f32")]] +kernel void kernel_mul_mv_iq1_m_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq1_m_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, nullptr, tgpig, tiisg, sgitg); } [[host_name("kernel_mul_mv_iq4_nl_f32")]] @@ -4694,7 +5530,7 @@ kernel void kernel_mul_mv_iq4_nl_f32( constant int64_t & ne1, constant uint & r2, constant uint & r3, - threadgroup float * shared_values [[threadgroup(0)]], + threadgroup int8_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -4723,7 +5559,7 @@ kernel void kernel_mul_mv_iq4_xs_f32( constant int64_t & ne1, constant uint & r2, constant uint & r3, - threadgroup float * shared_values [[threadgroup(0)]], + threadgroup int8_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { @@ -5146,6 +5982,38 @@ void dequantize_iq1_s(device const block_iq1_s * xb, short il, thread type4x4 & } } +template +void dequantize_iq1_m(device const block_iq1_m * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const int ib32 = il/2; + il = il%2; + device const uint16_t * sc = (device const uint16_t *)xb->scales; +#if QK_K == 64 + const float d = xb->d; +#else + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = scale.f16; +#endif + device const uint8_t * qs = xb->qs + 4*ib32 + 2*il; + device const uint8_t * qh = xb->qh + 2*ib32 + il; +#if QK_K == 64 + const float dl = d * (2*((sc[ib32/2] >> (8*(ib32%2)+4*il)) & 0xf) + 1); +#else + const float dl = d * (2*((sc[ib32/2] >> (6*(ib32%2)+3*il)) & 7) + 1); +#endif + const float ml1 = dl * (qh[0] & 0x08 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + const float ml2 = dl * (qh[0] & 0x80 ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA); + constant uint8_t * grid1 = (constant uint8_t *)(iq1s_grid_gpu + (qs[0] | ((qh[0] << 8) & 0x700))); + constant uint8_t * grid2 = (constant uint8_t *)(iq1s_grid_gpu + (qs[1] | ((qh[0] << 4) & 0x700))); + for (int i = 0; i < 4; ++i) { + reg[0][i] = dl * (grid1[i] & 0xf) + ml1; + reg[1][i] = dl * (grid1[i] >> 4) + ml1; + reg[2][i] = dl * (grid2[i] & 0xf) + ml2; + reg[3][i] = dl * (grid2[i] >> 4) + ml2; + } +} + template void dequantize_iq4_nl(device const block_iq4_nl * xb, short il, thread type4x4 & reg) { device const uint16_t * q4 = (device const uint16_t *)xb->qs; @@ -5448,25 +6316,25 @@ void kernel_mul_mm_impl(device const uchar * src0, } } -// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in src1ids +// same as kernel_mul_mm_impl, but src1 and dst are accessed via indices stored in rowids template void kernel_mul_mm_id_impl( device const uchar * src0, device const uchar * src1, - threadgroup short * src1ids, + threadgroup ushort2 * rowids, device float * dst, constant int64_t & ne00, constant int64_t & ne02, constant uint64_t & nb01, constant uint64_t & nb02, + constant int64_t & ne11, constant int64_t & ne12, constant uint64_t & nb10, constant uint64_t & nb11, constant uint64_t & nb12, constant int64_t & ne0, int64_t ne1, - constant uint & r2, - constant uint & r3, + int64_t ne0ne1, threadgroup uchar * shared_memory, uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], @@ -5477,7 +6345,6 @@ void kernel_mul_mm_id_impl( const uint r0 = tgpig.y; const uint r1 = tgpig.x; - const uint im = tgpig.z; if (r1 * BLOCK_SIZE_N >= ne1) return; @@ -5495,19 +6362,16 @@ void kernel_mul_mm_id_impl( for (int i = 0; i < 8; i++){ c_res[i] = make_filled_simdgroup_matrix(0.f); } - short il = (tiitg % THREAD_PER_ROW); - const uint i12 = im%ne12; - const uint i13 = im/ne12; - - uint offset0 = (i12/r2)*nb02 + (i13/r3)*(nb02*ne02); ushort offset1 = il/nl; - device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01 + offset0) + offset1; + threadgroup const auto & id = rowids[r1 * BLOCK_SIZE_N + thread_col]; + + device const block_q * x = (device const block_q *)(src0 + (r0 * BLOCK_SIZE_M + thread_row) * nb01) + offset1; device const float * y = (device const float *)(src1 - + nb12 * im - + nb11 * src1ids[r1 * BLOCK_SIZE_N + thread_col] + + nb12 * id[1] + + nb11 * (id[0] % ne11) + nb10 * (BLOCK_SIZE_K / THREAD_PER_COL * (tiitg % THREAD_PER_COL))); for (int loop_k = 0; loop_k < ne00; loop_k += BLOCK_SIZE_K) { @@ -5536,11 +6400,11 @@ void kernel_mul_mm_id_impl( for (int ik = 0; ik < BLOCK_SIZE_K / 8; ik++) { for (int i = 0; i < 4; i++) { - simdgroup_load(ma[i],lsma + SG_MAT_SIZE * i); + simdgroup_load(ma[i], lsma + SG_MAT_SIZE * i); } simdgroup_barrier(mem_flags::mem_none); for (int i = 0; i < 2; i++) { - simdgroup_load(mb[i],lsmb + SG_MAT_SIZE * i); + simdgroup_load(mb[i], lsmb + SG_MAT_SIZE * i); } lsma += BLOCK_SIZE_M / SG_MAT_ROW * SG_MAT_SIZE; @@ -5562,11 +6426,13 @@ void kernel_mul_mm_id_impl( threadgroup_barrier(mem_flags::mem_threadgroup); - device float * C = dst + (BLOCK_SIZE_M * r0) + im*ne1*ne0; + device float * C = dst + (BLOCK_SIZE_M * r0); if (sgitg == 0) { - for (int i = 0; i < n_rows; i++) { - for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { - *(C + i + src1ids[j + r1*BLOCK_SIZE_N] * ne0) = *(temp_str + i + j * BLOCK_SIZE_M); + for (int j = tiitg; j < n_cols; j += BLOCK_SIZE_N) { + threadgroup const auto & jid = rowids[r1 * BLOCK_SIZE_N + j]; + int joff = jid[0] * ne0 + jid[1] * ne0ne1; + for (int i = 0; i < n_rows; i++) { + *(C + i + joff) = *(temp_str + i + j * BLOCK_SIZE_M); } } } @@ -5617,14 +6483,18 @@ kernel void kernel_mul_mm(device const uchar * src0, template kernel void kernel_mul_mm_id( - device const uchar * ids, + device const uchar * src0s, device const uchar * src1, device float * dst, + device const uchar * ids, + constant int64_t & nei0, + constant int64_t & nei1, constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne02, constant uint64_t & nb01, constant uint64_t & nb02, + constant int64_t & ne11, constant int64_t & ne12, constant int64_t & ne13, constant uint64_t & nb10, @@ -5633,55 +6503,52 @@ kernel void kernel_mul_mm_id( constant int64_t & ne0, constant int64_t & ne1, constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const uchar * src00, - device const uchar * src01, - device const uchar * src02, - device const uchar * src03, - device const uchar * src04, - device const uchar * src05, - device const uchar * src06, - device const uchar * src07, threadgroup uchar * shared_memory [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const uchar * src0s[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - // expert id - const int32_t id = tgpig.z/(ne12*ne13); + const int32_t i02 = tgpig.z; + tgpig.z = 0; - tgpig.z = tgpig.z%(ne12*ne13); + device const uchar * src0 = src0s + i02*nb02; - // row indices of src1 for expert id - threadgroup short * src1ids = (threadgroup short *)(shared_memory + 8192); + // row indices + threadgroup ushort2 * rowids = (threadgroup ushort2 *)(shared_memory + 8192); + // TODO: parallelize this loop int64_t _ne1 = 0; - for (int64_t i1 = 0; i1 < ne1; i1++) { - if (((device int32_t *) (ids + i1*nbi1))[idx] == id) { - src1ids[_ne1++] = i1; + for (ushort ii1 = 0; ii1 < nei1; ii1++) { + for (ushort ii0 = 0; ii0 < nei0; ii0++) { + int32_t id = ((device int32_t *) (ids + ii1*nbi1))[ii0]; + if (id == i02) { + //if (tiitg == 0) { + rowids[_ne1] = ushort2(ii0, ii1); + //} + _ne1++; + } } } + threadgroup_barrier(mem_flags::mem_threadgroup); + kernel_mul_mm_id_impl( - src0s[id], + src0, src1, - src1ids, + rowids, dst, ne00, ne02, nb01, nb02, + ne11, ne12, nb10, nb11, nb12, ne0, _ne1, - r2, - r3, + ne0*ne1, shared_memory, tgpig, tiitg, @@ -5730,6 +6597,7 @@ template [[host_name("kernel_get_rows_iq3_xxs")]] kernel get_rows_t kernel_get_r template [[host_name("kernel_get_rows_iq3_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_s")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq1_s")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq1_m")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq4_nl")]] kernel get_rows_t kernel_get_rows; #if QK_K == 64 template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_rows; @@ -5741,43 +6609,27 @@ template [[host_name("kernel_get_rows_iq4_xs")]] kernel get_rows_t kernel_get_r // matrix-matrix multiplication // -typedef void (mat_mm_t)( - device const uchar * src0, - device const uchar * src1, - device float * dst, - constant int64_t & ne00, - constant int64_t & ne02, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne12, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint & r2, - constant uint & r3, - threadgroup uchar *, - uint3, uint, uint); +typedef decltype(kernel_mul_mm) mat_mm_t; -template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; -template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f32_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_f16_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_1_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q8_0_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q2_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q3_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q5_K_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq3_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_s_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq1_s_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq1_m_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq4_nl_f32")]] kernel mat_mm_t kernel_mul_mm; #if QK_K == 64 template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_mm; @@ -5789,55 +6641,27 @@ template [[host_name("kernel_mul_mm_iq4_xs_f32")]] kernel mat_mm_t kernel_mul_m // indirect matrix-matrix multiplication // -typedef void (mat_mm_id_t)( - device const uchar * ids, - device const uchar * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne02, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const uchar * src00, - device const uchar * src01, - device const uchar * src02, - device const uchar * src03, - device const uchar * src04, - device const uchar * src05, - device const uchar * src06, - device const uchar * src07, - threadgroup uchar *, - uint3, uint, uint); +typedef decltype(kernel_mul_mm_id) mat_mm_id_t; -template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; -template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f32_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_f16_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_1_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q8_0_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q2_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q3_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq3_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq1_s_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq1_m_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq4_nl_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; #if QK_K == 64 template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; @@ -5849,149 +6673,119 @@ template [[host_name("kernel_mul_mm_id_iq4_xs_f32")]] kernel mat_mm_id_t kernel // matrix-vector multiplication // -[[host_name("kernel_mul_mv_id_f32_f32")]] -kernel void kernel_mul_mv_id_f32_f32( - device const char * ids, +typedef void (kernel_mul_mv_impl_t)( + device const char * src0, + device const char * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + uint64_t nb00, + uint64_t nb01, + uint64_t nb02, + int64_t ne10, + int64_t ne11, + int64_t ne12, + uint64_t nb10, + uint64_t nb11, + uint64_t nb12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + uint3 tgpig, + uint tiisg); + +typedef void (kernel_mul_mv2_impl_t)( + device const void * src0, + device const float * src1, + device float * dst, + int64_t ne00, + int64_t ne01, + int64_t ne02, + int64_t ne10, + int64_t ne12, + int64_t ne0, + int64_t ne1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiisg, + uint sgitg); + +template +void mmv_fn( + device const char * src0, device const char * src1, device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_f32_f32_impl( - src0[id], - src1 + bid*nb11, - dst + bid*ne0, - ne00, - ne01, - ne02, - nb00, - nb01, - nb02, - ne10, - ne11, - ne12, - nb10, - nb11, - nb12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg); + int64_t ne00, + int64_t ne01, + int64_t ne02, + uint64_t nb00, + uint64_t nb01, + uint64_t nb02, + int64_t ne10, + int64_t ne11, + int64_t ne12, + int64_t ne13, + uint64_t nb10, + uint64_t nb11, + uint64_t nb12, + int64_t ne0, + int64_t ne1, + uint64_t nb1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiitg, + uint tiisg, + uint sgitg) { + impl_fn(src0,src1,dst,ne00,ne01,ne02,nb00,nb01,nb02,ne10,ne11,ne12,nb10,nb11,nb12,ne0,ne1,r2,r3,tgpig,tiisg); } -[[host_name("kernel_mul_mv_id_f16_f32")]] -kernel void kernel_mul_mv_id_f16_f32( - device const char * ids, +template +void mmv_fn( + device const char * src0, device const char * src1, device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_f16_f32_impl( - src0[id], - src1 + bid*nb11, - dst + bid*ne0, - ne00, - ne01, - ne02, - nb00, - nb01, - nb02, - ne10, - ne11, - ne12, - nb10, - nb11, - nb12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg); + int64_t ne00, + int64_t ne01, + int64_t ne02, + uint64_t nb00, + uint64_t nb01, + uint64_t nb02, + int64_t ne10, + int64_t ne11, + int64_t ne12, + int64_t ne13, + uint64_t nb10, + uint64_t nb11, + uint64_t nb12, + int64_t ne0, + int64_t ne1, + uint64_t nb1, + uint r2, + uint r3, + threadgroup int8_t * shared_values, + uint3 tgpig, + uint tiitg, + uint tiisg, + uint sgitg) { + impl_fn(src0,(const device float *)src1,dst,ne00,ne01,ne02,ne10,ne12,ne0,ne1,r2,r3,shared_values,tgpig,tiisg,sgitg); } -[[host_name("kernel_mul_mv_id_q8_0_f32")]] -kernel void kernel_mul_mv_id_q8_0_f32( - device const char * ids, +typedef decltype(mmv_fn) mul_mv_impl_fn_t; + +template +kernel void kernel_mul_mv_id( + device const char * src0s, device const char * src1, device float * dst, + device const char * ids, + constant int64_t & nei0, + constant int64_t & nei1, constant uint64_t & nbi1, constant int64_t & ne00, constant int64_t & ne01, @@ -6009,1132 +6803,80 @@ kernel void kernel_mul_mv_id_q8_0_f32( constant int64_t & ne0, constant int64_t & ne1, constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q8_0_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q4_0_f32")]] -kernel void kernel_mul_mv_id_q4_0_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - mul_vec_q_n_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q4_1_f32")]] -kernel void kernel_mul_mv_id_q4_1_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - mul_vec_q_n_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q5_0_f32")]] -kernel void kernel_mul_mv_id_q5_0_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - mul_vec_q_n_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q5_1_f32")]] -kernel void kernel_mul_mv_id_q5_1_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - mul_vec_q_n_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q2_K_f32")]] -kernel void kernel_mul_mv_id_q2_K_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q2_K_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q3_K_f32")]] -kernel void kernel_mul_mv_id_q3_K_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q3_K_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q4_K_f32")]] -kernel void kernel_mul_mv_id_q4_K_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q4_K_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q5_K_f32")]] -kernel void kernel_mul_mv_id_q5_K_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q5_K_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_q6_K_f32")]] -kernel void kernel_mul_mv_id_q6_K_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_q6_K_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] -kernel void kernel_mul_mv_id_iq2_xxs_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, threadgroup int8_t * shared_values [[threadgroup(0)]], uint3 tgpig[[threadgroup_position_in_grid]], uint tiitg[[thread_index_in_threadgroup]], uint tiisg[[thread_index_in_simdgroup]], uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + const int iid1 = tgpig.z/nei0; + const int idx = tgpig.z%nei0; - const int64_t bid = tgpig.z/(ne12*ne13); + tgpig.z = 0; - tgpig.z = tgpig.z%(ne12*ne13); + const int32_t i02 = ((device const int32_t *) (ids + iid1*nbi1))[idx]; - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + const int64_t i11 = idx % ne11; + const int64_t i12 = iid1; - kernel_mul_mv_iq2_xxs_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, + const int64_t i1 = idx; + const int64_t i2 = i12; + + device const char * src0_cur = src0s + i02*nb02; + device const char * src1_cur = src1 + i11*nb11 + i12*nb12; + device float * dst_cur = dst + i1*ne0 + i2*ne1*ne0; + + impl_fn( + /* src0 */ src0_cur, + /* src1 */ src1_cur, + /* dst */ dst_cur, + /* ne00 */ ne00, + /* ne01 */ ne01, + /* ne02 */ 1,//ne02, + /* nb00 */ nb00, + /* nb01 */ nb01, + /* nb02 */ nb02, + /* ne10 */ ne10, + /* ne11 */ 1,//ne11, + /* ne12 */ 1,//ne12, + /* ne13 */ 1,//ne13, + /* nb10 */ nb10, + /* nb11 */ nb11, + /* nb12 */ nb12, + /* ne0 */ ne0, + /* ne1 */ 1,//ne1, + /* nb1 */ nb1, + /* r2 */ 1, + /* r3 */ 1, shared_values, tgpig, + tiitg, tiisg, sgitg); } -[[host_name("kernel_mul_mv_id_iq2_xs_f32")]] -kernel void kernel_mul_mv_id_iq2_xs_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; +typedef decltype(kernel_mul_mv_id>) kernel_mul_mv_id_t; - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq2_xs_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] -kernel void kernel_mul_mv_id_iq3_xxs_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq3_xxs_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq3_s_f32")]] -kernel void kernel_mul_mv_id_iq3_s_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq3_s_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq2_s_f32")]] -kernel void kernel_mul_mv_id_iq2_s_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup int8_t * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq2_s_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq1_s_f32")]] -kernel void kernel_mul_mv_id_iq1_s_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq1_s_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq4_nl_f32")]] -kernel void kernel_mul_mv_id_iq4_nl_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup float * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - - kernel_mul_mv_iq4_nl_f32_impl( - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} - -[[host_name("kernel_mul_mv_id_iq4_xs_f32")]] -kernel void kernel_mul_mv_id_iq4_xs_f32( - device const char * ids, - device const char * src1, - device float * dst, - constant uint64_t & nbi1, - constant int64_t & ne00, - constant int64_t & ne01, - constant int64_t & ne02, - constant uint64_t & nb00, - constant uint64_t & nb01, - constant uint64_t & nb02, - constant int64_t & ne10, - constant int64_t & ne11, - constant int64_t & ne12, - constant int64_t & ne13, - constant uint64_t & nb10, - constant uint64_t & nb11, - constant uint64_t & nb12, - constant int64_t & ne0, - constant int64_t & ne1, - constant uint64_t & nb1, - constant uint & r2, - constant uint & r3, - constant int & idx, - device const char * src00, - device const char * src01, - device const char * src02, - device const char * src03, - device const char * src04, - device const char * src05, - device const char * src06, - device const char * src07, - threadgroup float * shared_values [[threadgroup(0)]], - uint3 tgpig[[threadgroup_position_in_grid]], - uint tiitg[[thread_index_in_threadgroup]], - uint tiisg[[thread_index_in_simdgroup]], - uint sgitg[[simdgroup_index_in_threadgroup]]) { - device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; - - const int64_t bid = tgpig.z/(ne12*ne13); - - tgpig.z = tgpig.z%(ne12*ne13); - - const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; - -#if QK_K == 64 - kernel_mul_mv_iq4_nl_f32_impl( -#else - kernel_mul_mv_iq4_xs_f32_impl( +template [[host_name("kernel_mul_mv_id_f32_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_f16_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q8_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q4_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q4_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_0_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q5_1_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>>; +template [[host_name("kernel_mul_mv_id_q2_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q3_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q4_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q5_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_q6_K_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq1_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq1_m_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq2_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq2_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq3_xxs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq3_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq2_s_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +template [[host_name("kernel_mul_mv_id_iq4_nl_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; +#if QK_K != 64 +template [[host_name("kernel_mul_mv_id_iq4_xs_f32")]] kernel kernel_mul_mv_id_t kernel_mul_mv_id>; #endif - src0[id], - (device const float *) (src1 + bid*nb11), - dst + bid*ne0, - ne00, - ne01, - ne02, - ne10, - ne12, - ne0, - ne1, - r2, - r3, - shared_values, - tgpig, - tiisg, - sgitg); -} + diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index aa73d67df..880a14958 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_ if (alignment == (cl_uint)-1) { ggml_cl_init(); clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + alignment /= 8; // bits to bytes } return alignment; @@ -2234,6 +2235,11 @@ static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(gg static ggml_status ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { for (int i = 0; i < graph->n_nodes; ++i) { ggml_tensor * node = graph->nodes[i]; + + if (ggml_is_empty(node)) { + continue; + } + switch (node->op) { case GGML_OP_MUL_MAT: ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); diff --git a/ggml-quants.c b/ggml-quants.c index 2eaca0593..9883b6f8c 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -14,47 +14,6 @@ #include // for qsort #include // for GGML_ASSERT -#ifdef __ARM_NEON - -// if YCM cannot find , make a symbolic link to it, for example: -// -// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ -// -#include - -#else - -#ifdef __wasm_simd128__ -#include -#else -#if defined(__POWER9_VECTOR__) || defined(__powerpc64__) -#include -#undef bool -#define bool _Bool -#else -#if defined(_MSC_VER) || defined(__MINGW32__) -#include -#else -#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) -#if !defined(__riscv) -#include -#endif -#endif -#endif -#endif -#endif -#endif - -#ifdef __riscv_v_intrinsic -#include -#endif - -#undef MIN -#undef MAX - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - #define UNUSED GGML_UNUSED // some compilers don't provide _mm256_set_m128i, e.g. gcc 7 @@ -132,7 +91,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) { } static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { -#if __AVXVNNI__ +#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) const __m256i zero = _mm256_setzero_si256(); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); return _mm256_cvtepi32_ps(summed_pairs); @@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128 #endif // __AVX__ || __AVX2__ || __AVX512F__ #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) -#if defined(__ARM_NEON) - -#ifdef _MSC_VER - -#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) } - -#else - -#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) } - -#endif - -#if !defined(__aarch64__) - -// 64-bit compatibility - -// vaddvq_s16 -// vpaddq_s16 -// vpaddq_s32 -// vaddvq_s32 -// vaddvq_f32 -// vmaxvq_f32 -// vcvtnq_s32_f32 -// vzip1_u8 -// vzip2_u8 - -inline static int32_t vaddvq_s16(int16x8_t v) { - return - (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) + - (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) + - (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) + - (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7); -} - -inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) { - int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a)); - int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b)); - return vcombine_s16(a0, b0); -} - -inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) { - int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a)); - int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b)); - return vcombine_s32(a0, b0); -} - -inline static int32_t vaddvq_s32(int32x4_t v) { - return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3); -} - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -inline static float vmaxvq_f32(float32x4_t v) { - return - MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)), - MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3))); -} - -inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) { - int32x4_t res; - - res[0] = roundf(vgetq_lane_f32(v, 0)); - res[1] = roundf(vgetq_lane_f32(v, 1)); - res[2] = roundf(vgetq_lane_f32(v, 2)); - res[3] = roundf(vgetq_lane_f32(v, 3)); - - return res; -} - -inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[0]; res[1] = b[0]; - res[2] = a[1]; res[3] = b[1]; - res[4] = a[2]; res[5] = b[2]; - res[6] = a[3]; res[7] = b[3]; - - return res; -} - -inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) { - uint8x8_t res; - - res[0] = a[4]; res[1] = b[4]; - res[2] = a[5]; res[3] = b[5]; - res[4] = a[6]; res[5] = b[6]; - res[6] = a[7]; res[7] = b[7]; - - return res; -} - -// vld1q_s16_x2 -// vld1q_u8_x2 -// vld1q_u8_x4 -// vld1q_s8_x2 -// vld1q_s8_x4 -// TODO: double-check these work correctly - -typedef struct ggml_int16x8x2_t { - int16x8_t val[2]; -} ggml_int16x8x2_t; - -inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) { - ggml_int16x8x2_t res; - - res.val[0] = vld1q_s16(ptr + 0); - res.val[1] = vld1q_s16(ptr + 8); - - return res; -} - -typedef struct ggml_uint8x16x2_t { - uint8x16_t val[2]; -} ggml_uint8x16x2_t; - -inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) { - ggml_uint8x16x2_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - - return res; -} - -typedef struct ggml_uint8x16x4_t { - uint8x16_t val[4]; -} ggml_uint8x16x4_t; - -inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) { - ggml_uint8x16x4_t res; - - res.val[0] = vld1q_u8(ptr + 0); - res.val[1] = vld1q_u8(ptr + 16); - res.val[2] = vld1q_u8(ptr + 32); - res.val[3] = vld1q_u8(ptr + 48); - - return res; -} - -typedef struct ggml_int8x16x2_t { - int8x16_t val[2]; -} ggml_int8x16x2_t; - -inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) { - ggml_int8x16x2_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - - return res; -} - -typedef struct ggml_int8x16x4_t { - int8x16_t val[4]; -} ggml_int8x16x4_t; - -inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) { - ggml_int8x16x4_t res; - - res.val[0] = vld1q_s8(ptr + 0); - res.val[1] = vld1q_s8(ptr + 16); - res.val[2] = vld1q_s8(ptr + 32); - res.val[3] = vld1q_s8(ptr + 48); - - return res; -} - -// NOTE: not tested -inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) { - int8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -// NOTE: not tested -inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) { - uint8x16_t res; - - res[ 0] = a[b[ 0]]; - res[ 1] = a[b[ 1]]; - res[ 2] = a[b[ 2]]; - res[ 3] = a[b[ 3]]; - res[ 4] = a[b[ 4]]; - res[ 5] = a[b[ 5]]; - res[ 6] = a[b[ 6]]; - res[ 7] = a[b[ 7]]; - res[ 8] = a[b[ 8]]; - res[ 9] = a[b[ 9]]; - res[10] = a[b[10]]; - res[11] = a[b[11]]; - res[12] = a[b[12]]; - res[13] = a[b[13]]; - res[14] = a[b[14]]; - res[15] = a[b[15]]; - - return res; -} - -#else - -#define ggml_int16x8x2_t int16x8x2_t -#define ggml_uint8x16x2_t uint8x16x2_t -#define ggml_uint8x16x4_t uint8x16x4_t -#define ggml_int8x16x2_t int8x16x2_t -#define ggml_int8x16x4_t int8x16x4_t - -#define ggml_vld1q_s16_x2 vld1q_s16_x2 -#define ggml_vld1q_u8_x2 vld1q_u8_x2 -#define ggml_vld1q_u8_x4 vld1q_u8_x4 -#define ggml_vld1q_s8_x2 vld1q_s8_x2 -#define ggml_vld1q_s8_x4 vld1q_s8_x4 -#define ggml_vqtbl1q_s8 vqtbl1q_s8 -#define ggml_vqtbl1q_u8 vqtbl1q_u8 - -#endif - -#if !defined(__ARM_FEATURE_DOTPROD) - -inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) { - const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b)); - const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b)); - - return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1))); -} - -#else - -#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c) - -#endif - -#endif - #if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) @@ -544,7 +251,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -581,12 +288,12 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) { const int qk = QK4_1; assert(k % qk == 0); @@ -623,11 +330,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q4_1_reference(x, y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -671,11 +378,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q5_0_reference(x, y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) { const int qk = QK5_1; assert(k % qk == 0); @@ -719,12 +426,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q5_1_reference(x, y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -749,7 +456,7 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -938,7 +645,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -973,7 +680,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -1192,7 +899,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1212,7 +919,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int64_t k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1233,7 +940,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1259,7 +966,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int64_t k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1286,7 +993,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int64_t k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1581,7 +1288,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1658,7 +1365,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1704,7 +1411,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) { quantize_row_q2_K_reference(x, vy, k); } @@ -1960,14 +1667,14 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri } } -size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row); if (!quant_weights) { - quantize_row_q2_K_reference(src, dst, nrow*n_per_row); + quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row); } else { char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -1978,7 +1685,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2092,7 +1799,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2142,7 +1849,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -2175,11 +1882,11 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) { quantize_row_q3_K_reference(x, vy, k); } -static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) { +static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) { #if QK_K != 256 (void)quant_weights; quantize_row_q3_K_reference(x, y, n_per_row); @@ -2268,14 +1975,14 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri #endif } -size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row); if (!quant_weights) { - quantize_row_q3_K_reference(src, dst, nrow*n_per_row); + quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row); } else { char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q3_K_impl(src, (block_q3_K*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -2286,7 +1993,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2393,7 +2100,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2432,19 +2139,19 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_q4_K * restrict y = vy; quantize_row_q4_K_reference(x, y, k); } -static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; quantize_row_q4_K_reference(x, y, n_per_row); #else assert(n_per_row % QK_K == 0); - const int nb = n_per_row / QK_K; + const int64_t nb = n_per_row / QK_K; uint8_t L[QK_K]; uint8_t Laux[32]; @@ -2516,14 +2223,14 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri #endif } -size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row); if (!quant_weights) { - quantize_row_q4_K_reference(src, dst, nrow*n_per_row); + quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row); } else { char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q4_K_impl(src, (block_q4_K*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -2534,9 +2241,9 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; #if QK_K == 256 uint8_t L[QK_K]; @@ -2676,9 +2383,9 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -2721,19 +2428,19 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_q5_K * restrict y = vy; quantize_row_q5_K_reference(x, y, k); } -static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; quantize_row_q5_K_reference(x, y, n_per_row); #else assert(n_per_row % QK_K == 0); - const int nb = n_per_row / QK_K; + const int64_t nb = n_per_row / QK_K; uint8_t L[QK_K]; uint8_t Laux[32]; @@ -2825,14 +2532,14 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri #endif } -size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row); if (!quant_weights) { - quantize_row_q5_K_reference(src, dst, nrow*n_per_row); + quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row); } else { char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q5_K_impl(src, (block_q5_K*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -2843,9 +2550,9 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; int8_t L[QK_K]; float scales[QK_K/16]; @@ -2925,9 +2632,9 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -2972,19 +2679,19 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_q6_K * restrict y = vy; quantize_row_q6_K_reference(x, y, k); } -static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) { #if QK_K != 256 (void)quant_weights; quantize_row_q6_K_reference(x, y, n_per_row); #else assert(n_per_row % QK_K == 0); - const int nb = n_per_row / QK_K; + const int64_t nb = n_per_row / QK_K; int8_t L[QK_K]; float scales[QK_K/16]; @@ -3067,14 +2774,14 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri #endif } -size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row); if (!quant_weights) { - quantize_row_q6_K_reference(src, dst, nrow*n_per_row); + quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row); } else { char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q6_K_impl(src, (block_q6_K*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -3083,7 +2790,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, return nrow * row_size; } -static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_0 == 32, "QK4_0 must be 32"); if (!quant_weights) { @@ -3098,7 +2805,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; float sigma2 = sum_x2/n_per_row; - const int nb = n_per_row/QK4_0; + const int64_t nb = n_per_row/QK4_0; for (int ib = 0; ib < nb; ++ib) { const float * xb = x + QK4_0 * ib; const float * qw = quant_weights + QK4_0 * ib; @@ -3111,14 +2818,14 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri } } -size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { - quantize_row_q4_0_reference(src, dst, nrow*n_per_row); + quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row); char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q4_0_impl(src, (block_q4_0*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -3126,7 +2833,7 @@ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, return nrow * row_size; } -static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restrict y, int64_t n_per_row, const float * quant_weights) { static_assert(QK4_1 == 32, "QK4_1 must be 32"); if (!quant_weights) { @@ -3141,7 +2848,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; float sigma2 = sum_x2/n_per_row; - const int nb = n_per_row/QK4_1; + const int64_t nb = n_per_row/QK4_1; for (int ib = 0; ib < nb; ++ib) { const float * xb = x + QK4_1 * ib; const float * qw = quant_weights + QK4_1 * ib; @@ -3156,14 +2863,14 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri } } -size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { - quantize_row_q4_1_reference(src, dst, nrow*n_per_row); + quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row); char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q4_1_impl(src, (block_q4_1*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -3171,7 +2878,7 @@ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, return nrow * row_size; } -static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restrict y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_0 == 32, "QK5_0 must be 32"); if (!quant_weights) { @@ -3186,7 +2893,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; float sigma2 = sum_x2/n_per_row; - const int nb = n_per_row/QK5_0; + const int64_t nb = n_per_row/QK5_0; for (int ib = 0; ib < nb; ++ib) { const float * xb = x + QK5_0 * ib; const float * qw = quant_weights + QK5_0 * ib; @@ -3210,14 +2917,14 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri } } -size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { - quantize_row_q5_0_reference(src, dst, nrow*n_per_row); + quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row); char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q5_0_impl(src, (block_q5_0*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -3225,7 +2932,7 @@ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, return nrow * row_size; } -static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int n_per_row, const float * quant_weights) { +static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restrict y, int64_t n_per_row, const float * quant_weights) { static_assert(QK5_1 == 32, "QK5_1 must be 32"); if (!quant_weights) { @@ -3240,7 +2947,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri for (int j = 0; j < n_per_row; ++j) sum_x2 += x[j]*x[j]; float sigma2 = sum_x2/n_per_row; - const int nb = n_per_row/QK5_1; + const int64_t nb = n_per_row/QK5_1; for (int ib = 0; ib < nb; ++ib) { const float * xb = x + QK5_1 * ib; const float * qw = quant_weights + QK5_1 * ib; @@ -3263,14 +2970,14 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri } } -size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { if (!quant_weights) { - quantize_row_q5_1_reference(src, dst, nrow*n_per_row); + quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row); return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row); } size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row); char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_q5_1_impl(src, (block_q5_1*)qrow, n_per_row, quant_weights); src += n_per_row; qrow += row_size; @@ -3278,18 +2985,18 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, return nrow * row_size; } -size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { (void)quant_weights; // not used const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row); - quantize_row_q8_0_reference(src, dst, nrow*n_per_row); + quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row); return nrow * row_size; } // ====================== "True" 2-bit (de)-quantization -void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { +void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; uint32_t aux32[2]; const uint8_t * aux8 = (const uint8_t *)aux32; @@ -3315,9 +3022,9 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y // ====================== 2.3125 bpw (de)-quantization -void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { +void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; float db[2]; @@ -3342,9 +3049,9 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, // ====================== 2.5625 bpw (de)-quantization -void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int k) { +void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; float db[2]; @@ -3374,9 +3081,9 @@ void dequantize_row_iq2_s(const block_iq2_s * restrict x, float * restrict y, in // ====================== 3.0625 bpw (de)-quantization -void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int k) { +void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; uint32_t aux32; @@ -3406,9 +3113,9 @@ void dequantize_row_iq3_xxs(const block_iq3_xxs * restrict x, float * restrict y // ====================== 3.3125 bpw (de)-quantization -void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int k) { +void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3449,9 +3156,9 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in // ====================== 1.5625 bpw (de)-quantization -void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int k) { +void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3474,11 +3181,70 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in } } +void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + float delta[4]; + uint16_t idx[4]; + +#if QK_K != 64 + iq1m_scale_t scale; +#endif + + for (int i = 0; i < nb; i++) { + + const uint16_t * sc = (const uint16_t *)x[i].scales; +#if QK_K == 64 + const float d = GGML_FP16_TO_FP32(x[i].d); +#else + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = GGML_FP16_TO_FP32(scale.f16); +#endif + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + + for (int ib = 0; ib < QK_K/32; ++ib) { +#if QK_K == 64 + const float dl1 = d * (2*((sc[ib/2] >> (8*(ib%2)+0)) & 0xf) + 1); + const float dl2 = d * (2*((sc[ib/2] >> (8*(ib%2)+4)) & 0xf) + 1); +#else + const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1); + const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1); +#endif + idx[0] = qs[0] | ((qh[0] << 8) & 0x700); + idx[1] = qs[1] | ((qh[0] << 4) & 0x700); + idx[2] = qs[2] | ((qh[1] << 8) & 0x700); + idx[3] = qs[3] | ((qh[1] << 4) & 0x700); + delta[0] = qh[0] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[1] = qh[0] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[2] = qh[1] & 0x08 ? -IQ1S_DELTA : IQ1S_DELTA; + delta[3] = qh[1] & 0x80 ? -IQ1S_DELTA : IQ1S_DELTA; + for (int l = 0; l < 2; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + y[j] = dl1 * (grid[j] + delta[l]); + } + y += 8; + } + for (int l = 2; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + idx[l]); + for (int j = 0; j < 8; ++j) { + y[j] = dl2 * (grid[j] + delta[l]); + } + y += 8; + } + qs += 4; + qh += 2; + } + } +} + static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; -void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int k) { +void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, int64_t k) { assert(k % QK4_NL == 0); - const int nb = k / QK4_NL; + const int64_t nb = k / QK4_NL; for (int i = 0; i < nb; i++) { @@ -3494,12 +3260,12 @@ void dequantize_row_iq4_nl(const block_iq4_nl * restrict x, float * restrict y, } } -void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int k) { +void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); #if QK_K == 64 dequantize_row_iq4_nl((const block_iq4_nl *)x, y, k); #else - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3523,9 +3289,9 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y, //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { @@ -3562,9 +3328,9 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int64_t k) { assert(k % QK_K == 0); - const int nb = k / QK_K; + const int64_t nb = k / QK_K; for (int i = 0; i < nb; i++) { for (int j = 0; j < QK_K; ++j) { @@ -3573,7 +3339,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { +void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) { quantize_row_q8_K_reference(x, y, k); } @@ -9695,6 +9461,248 @@ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void #endif } +void ggml_vec_dot_iq1_m_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_iq1_m * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if QK_K != 64 + iq1m_scale_t scale; +#endif + +#if defined __ARM_NEON + +#if QK_K == 64 + const int32x4_t mask = vdupq_n_s32(0xf); +#else + const int32x4_t mask = vdupq_n_s32(0x7); +#endif + const int32x4_t mone = vdupq_n_s32(1); + const int32x4_t mzero = vdupq_n_s32(0); + + ggml_int8x16x4_t deltas; + deltas.val[0] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(+1)); + deltas.val[1] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(+1)); + deltas.val[2] = vcombine_s8(vdup_n_s8(+1), vdup_n_s8(-1)); + deltas.val[3] = vcombine_s8(vdup_n_s8(-1), vdup_n_s8(-1)); + + ggml_int8x16x4_t q1b; + ggml_int8x16x4_t q8b; + + uint32_t aux32; + const uint8_t * aux8 = (const uint8_t *)&aux32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + +#if QK_K != 64 + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); +#endif + + int32x4_t sumi1 = mzero; + int32x4_t sumi2 = mzero; + + for (int ib = 0; ib < QK_K/32; ib += 2) { + + q1b.val[0] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[0] | ((qh[0] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[1] | ((qh[0] << 4) & 0x700))))); + q1b.val[1] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[2] | ((qh[1] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[3] | ((qh[1] << 4) & 0x700))))); + q1b.val[2] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[4] | ((qh[2] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[5] | ((qh[2] << 4) & 0x700))))); + q1b.val[3] = vcombine_s8(vld1_s8((const int8_t *)(iq1s_grid + (qs[6] | ((qh[3] << 8) & 0x700)))), + vld1_s8((const int8_t *)(iq1s_grid + (qs[7] | ((qh[3] << 4) & 0x700))))); + + q8b = ggml_vld1q_s8_x4(q8); q8 += 64; + + const int32x4_t p1 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[0], q8b.val[0]), ggml_vdotq_s32(mzero, q1b.val[1], q8b.val[1])); + const int32x4_t p2 = vpaddq_s32(ggml_vdotq_s32(mzero, q1b.val[2], q8b.val[2]), ggml_vdotq_s32(mzero, q1b.val[3], q8b.val[3])); + const int32x4_t p12 = vpaddq_s32(p1, p2); + + const uint32_t * qh32 = (const uint32_t *)qh; // we are 4-byte aligned, so we can do that + aux32 = ((qh32[0] >> 3) & 0x01010101) | ((qh32[0] >> 6) & 0x02020202); + + const int32x4_t p3 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[0]], q8b.val[0]), ggml_vdotq_s32(mzero, deltas.val[aux8[1]], q8b.val[1])); + const int32x4_t p4 = vpaddq_s32(ggml_vdotq_s32(mzero, deltas.val[aux8[2]], q8b.val[2]), ggml_vdotq_s32(mzero, deltas.val[aux8[3]], q8b.val[3])); + const int32x4_t p34 = vpaddq_s32(p3, p4); + +#if QK_K == 64 + int32x4_t scales_4 = ggml_vld1q_u32(sc[0] >> 0, sc[0] >> 4, sc[0] >> 8, sc[0] >> 12); +#else + int32x4_t scales_4 = ggml_vld1q_u32(sc[ib/2] >> 0, sc[ib/2] >> 3, sc[ib/2] >> 6, sc[ib/2] >> 9); +#endif + scales_4 = vaddq_s32(vshlq_n_s32(vandq_s32(scales_4, mask), 1), mone); + + sumi1 = vmlaq_s32(sumi1, scales_4, p12); + sumi2 = vmlaq_s32(sumi2, scales_4, p34); + + qs += 8; qh += 4; + + } + +#if QK_K == 64 + sumf += y[i].d * GGML_FP16_TO_FP32(x[i].d) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); +#else + sumf += y[i].d * GGML_FP16_TO_FP32(scale.f16) * (vaddvq_s32(sumi1) + IQ1M_DELTA * vaddvq_s32(sumi2)); +#endif + } + + *s = sumf; + +#elif defined __AVX2__ + +#if QK_K == 64 + const __m256i mask = _mm256_set1_epi16(0xf); +#else + const __m256i mask = _mm256_set1_epi16(0x7); +#endif + const __m256i mone = _mm256_set1_epi16(1); + + __m256 accum1 = _mm256_setzero_ps(); + __m256 accum2 = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + +#if QK_K != 64 + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); +#endif + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib = 0; ib < QK_K/32; ib += 2) { + const __m256i q1b_1 = _mm256_set_epi64x( + iq1s_grid[qs[3] | (((uint16_t)qh[1] << 4) & 0x700)], iq1s_grid[qs[2] | (((uint16_t)qh[1] << 8) & 0x700)], + iq1s_grid[qs[1] | (((uint16_t)qh[0] << 4) & 0x700)], iq1s_grid[qs[0] | (((uint16_t)qh[0] << 8) & 0x700)] + ); + const __m256i q1b_2 = _mm256_set_epi64x( + iq1s_grid[qs[7] | (((uint16_t)qh[3] << 4) & 0x700)], iq1s_grid[qs[6] | (((uint16_t)qh[3] << 8) & 0x700)], + iq1s_grid[qs[5] | (((uint16_t)qh[2] << 4) & 0x700)], iq1s_grid[qs[4] | (((uint16_t)qh[2] << 8) & 0x700)] + ); + const __m256i q8b_1 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + const __m256i q8b_2 = _mm256_loadu_si256((const __m256i*)q8); q8 += 32; + + const __m256i dot1 = mul_add_epi8(q1b_1, q8b_1); + const __m256i dot2 = mul_add_epi8(q1b_2, q8b_2); + + const __m256i delta1 = _mm256_set_epi64x(qh[1] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[1] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[0] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + const __m256i delta2 = _mm256_set_epi64x(qh[3] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[3] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x80 ? 0xffffffffffffffff : 0x0101010101010101, + qh[2] & 0x08 ? 0xffffffffffffffff : 0x0101010101010101); + + const __m256i dot3 = mul_add_epi8(delta1, q8b_1); + const __m256i dot4 = mul_add_epi8(delta2, q8b_2); +#if QK_K == 64 + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 4), _mm_set1_epi16(sc[0] >> 0)); + __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[0] >> 12), _mm_set1_epi16(sc[0] >> 8)); +#else + __m256i scale1 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 3), _mm_set1_epi16(sc[ib/2] >> 0)); + __m256i scale2 = MM256_SET_M128I(_mm_set1_epi16(sc[ib/2] >> 9), _mm_set1_epi16(sc[ib/2] >> 6)); +#endif + scale1 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale1, mask), 1), mone); + scale2 = _mm256_add_epi16(_mm256_slli_epi16(_mm256_and_si256(scale2, mask), 1), mone); + const __m256i p1 = _mm256_madd_epi16(dot1, scale1); + const __m256i p2 = _mm256_madd_epi16(dot2, scale2); + const __m256i p3 = _mm256_madd_epi16(dot3, scale1); + const __m256i p4 = _mm256_madd_epi16(dot4, scale2); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_add_epi32(p1, p2)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_add_epi32(p3, p4)); + + qs += 8; qh += 4; + } + +#if QK_K == 64 + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(x[i].d)); +#else + const __m256 d = _mm256_set1_ps(y[i].d * GGML_FP16_TO_FP32(scale.f16)); +#endif + accum1 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi1), accum1); + accum2 = _mm256_fmadd_ps(d, _mm256_cvtepi32_ps(sumi2), accum2); + + } + + *s = hsum_float_8(accum1) + IQ1M_DELTA * hsum_float_8(accum2); + +#else + + int sum1[2], sum2[2], delta[4]; + + float sumf = 0; + for (int i = 0; i < nb; i++) { + + const int8_t * q8 = y[i].qs; + const uint8_t * qs = x[i].qs; + const uint8_t * qh = x[i].qh; + const uint16_t * sc = (const uint16_t *)x[i].scales; + +#if QK_K != 64 + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); +#endif + + int sumi1 = 0, sumi2 = 0; + for (int ib = 0; ib < QK_K/32; ++ib) { + delta[0] = qh[0] & 0x08 ? -1 : 1; + delta[1] = qh[0] & 0x80 ? -1 : 1; + delta[2] = qh[1] & 0x08 ? -1 : 1; + delta[3] = qh[1] & 0x80 ? -1 : 1; + sum1[0] = sum1[1] = sum2[0] = sum2[1] = 0; + for (int l = 0; l < 4; ++l) { + const int8_t * grid = (const int8_t *)(iq1s_grid + (qs[l] | (((uint16_t)qh[l/2] << (8 - 4*(l%2))) & 0x700))); + int lsum1 = 0, lsum2 = 0; + for (int j = 0; j < 8; ++j) { + lsum1 += q8[j] * grid[j]; + lsum2 += q8[j]; + } + q8 += 8; + sum1[l/2] += lsum1; + sum2[l/2] += lsum2*delta[l]; + } +#if QK_K == 64 + const int ls1 = 2*((sc[0] >> (8*(ib%2)+0)) & 0xf) + 1; + const int ls2 = 2*((sc[0] >> (8*(ib%2)+4)) & 0xf) + 1; +#else + const int ls1 = 2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1; + const int ls2 = 2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1; +#endif + sumi1 += sum1[0] * ls1 + sum1[1] * ls2; + sumi2 += sum2[0] * ls1 + sum2[1] * ls2; + qs += 4; + qh += 2; + } + +#if QK_K == 64 + sumf += GGML_FP16_TO_FP32(x[i].d) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); +#else + sumf += GGML_FP16_TO_FP32(scale.f16) * y[i].d * (sumi1 + IQ1M_DELTA * sumi2); +#endif + } + + *s = sumf; + +#endif +} + void ggml_vec_dot_iq4_nl_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -9938,17 +9946,17 @@ static iq2_entry_t iq2_data[4] = { }; static inline int iq2_data_index(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); return type == GGML_TYPE_IQ2_XXS ? 0 : type == GGML_TYPE_IQ2_XS ? 1 : - type == GGML_TYPE_IQ1_S ? 2 : 3; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 2 : 3; } static inline int iq2_grid_size(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); return type == GGML_TYPE_IQ2_XXS ? 256 : type == GGML_TYPE_IQ2_XS ? 512 : - type == GGML_TYPE_IQ1_S ? NGRID_IQ1S : 1024; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? NGRID_IQ1S : 1024; } static int iq2_compare_func(const void * left, const void * right) { @@ -10214,10 +10222,10 @@ void iq2xs_init_impl(enum ggml_type type) { const int kmap_size = 43692; //const int nwant = type == GGML_TYPE_IQ1_S ? 3 : 2; - const int nwant = type == GGML_TYPE_IQ1_S ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; + const int nwant = type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? 3 : type == GGML_TYPE_IQ2_S ? 1 : 2; const uint16_t * kgrid = type == GGML_TYPE_IQ2_XXS ? kgrid_2bit_256 : type == GGML_TYPE_IQ2_XS ? kgrid_2bit_512 : - type == GGML_TYPE_IQ1_S ? kgrid_1bit_2048 : kgrid_2bit_1024; + type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M ? kgrid_1bit_2048 : kgrid_2bit_1024; uint64_t * kgrid_q2xs; int * kmap_q2xs; uint16_t * kneighbors_q2xs; @@ -10314,7 +10322,7 @@ void iq2xs_init_impl(enum ggml_type type) { } void iq2xs_free_impl(enum ggml_type type) { - GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ2_S); + GGML_ASSERT(type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ1_S || type == GGML_TYPE_IQ1_M || type == GGML_TYPE_IQ2_S); const int gindex = iq2_data_index(type); if (iq2_data[gindex].grid) { free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL; @@ -10347,7 +10355,7 @@ static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XXS); @@ -10363,7 +10371,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict const int kMaxQ = 3; - const int nbl = n/QK_K; + const int64_t nbl = n/QK_K; block_iq2_xxs * y = vy; @@ -10520,7 +10528,7 @@ static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict } } -static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_XS); @@ -10536,7 +10544,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v const int kMaxQ = 3; - const int nbl = n/QK_K; + const int64_t nbl = n/QK_K; block_iq2_xs * y = vy; @@ -10700,11 +10708,11 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v } } -size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq2_xxs); @@ -10712,11 +10720,11 @@ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nro return nrow * nblock * sizeof(block_iq2_xxs); } -size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq2_xs); @@ -10941,7 +10949,7 @@ static int iq3_find_best_neighbour(const uint16_t * restrict neighbours, const u return grid_index; } -static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int n, +static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { const int gindex = iq3_data_index(grid_size); @@ -10958,7 +10966,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v const int kMaxQ = 8; - const int nbl = n/QK_K; + const int64_t nbl = n/QK_K; ggml_fp16_t * dh; uint8_t * qs; @@ -11154,11 +11162,11 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v } } -size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_iq3_xxs_impl(256, src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq3_xxs); @@ -11166,13 +11174,13 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nro return nrow * nblock * sizeof(block_iq3_xxs); } -void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int k) { +void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_iq3_xxs * restrict y = vy; quantize_row_iq3_xxs_reference(x, y, k); } -void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int k) { +void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) { assert(k % QK_K == 0); quantize_row_iq3_xxs_impl(256, x, y, k, NULL); } @@ -11203,7 +11211,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo const int kMaxQ = 8; - const int nbl = n/QK_K; + const int64_t nbl = n/QK_K; block_iq3_s * y = vy; @@ -11360,9 +11368,9 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } #define IQ3S_BLOCK_SIZE 32 -size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; float scales[QK_K/IQ3S_BLOCK_SIZE]; float weight[IQ3S_BLOCK_SIZE]; float xval[IQ3S_BLOCK_SIZE]; @@ -11373,7 +11381,7 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, bool is_on_grid_aux[IQ3S_BLOCK_SIZE/4]; uint8_t block_signs[IQ3S_BLOCK_SIZE/8]; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_iq3_s_impl(IQ3S_BLOCK_SIZE, src, qrow, n_per_row, quant_weights, scales, weight, xval, L, Laux, waux, is_on_grid, is_on_grid_aux, block_signs); src += n_per_row; @@ -11382,13 +11390,13 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, return nrow * nblock * sizeof(block_iq3_s); } -void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) { +void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_iq3_s * restrict y = vy; quantize_row_iq3_s_reference(x, y, k); } -void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) { +void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) { assert(k % QK_K == 0); quantize_iq3_s(x, y, 1, k, NULL); } @@ -11520,7 +11528,16 @@ static int iq1_sort_helper(const void * left, const void * right) { } #define IQ1S_BLOCK_SIZE 32 -static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +#define IQ1M_BLOCK_SIZE 16 +static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, + float * scales, + float * weight, + float * sumx, + float * sumw, + float * pairs, + int8_t * L, + uint16_t * index, + int8_t * shifts) { const int gindex = iq2_data_index(GGML_TYPE_IQ1_S); @@ -11534,22 +11551,17 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); GGML_ASSERT(n%QK_K == 0); - const int nbl = n/QK_K; - block_iq1_s * y = vy; + const int64_t nbl = n/QK_K; + + const int block_size = IQ1S_BLOCK_SIZE; + const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA}; const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA}; - float scales[QK_K/IQ1S_BLOCK_SIZE]; - float weight[IQ1S_BLOCK_SIZE]; - int8_t L[IQ1S_BLOCK_SIZE]; - float sumx[IQ1S_BLOCK_SIZE+1]; - float sumw[IQ1S_BLOCK_SIZE+1]; - float pairs[2*IQ1S_BLOCK_SIZE]; + int * idx = (int *)(pairs + 1); - uint16_t index[IQ1S_BLOCK_SIZE/8]; - int8_t shifts[QK_K/IQ1S_BLOCK_SIZE]; for (int ibl = 0; ibl < nbl; ++ibl) { @@ -11564,15 +11576,15 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; float sigma2 = 2*sumx2/QK_K; - for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { - const float * xb = xbl + IQ1S_BLOCK_SIZE*ib; - const float * qw = quant_weights + QK_K*ibl + IQ1S_BLOCK_SIZE*ib; - for (int i = 0; i < IQ1S_BLOCK_SIZE; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); float max = fabsf(xb[0]); - for (int i = 1; i < IQ1S_BLOCK_SIZE; ++i) max = MAX(max, fabsf(xb[i])); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); if (!max) { scales[ib] = 0; - memset(L, 1, IQ1S_BLOCK_SIZE); + memset(L, 1, block_size); continue; } // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. @@ -11581,14 +11593,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale // for each possible and score for each split. - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { + for (int j = 0; j < block_size; ++j) { pairs[2*j] = xb[j]; idx[2*j] = j; } - qsort(pairs, IQ1S_BLOCK_SIZE, 2*sizeof(float), iq1_sort_helper); + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); { sumx[0] = sumw[0] = 0; - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) { + for (int j = 0; j < block_size; ++j) { int i = idx[2*j]; sumx[j+1] = sumx[j] + weight[i]*xb[i]; sumw[j+1] = sumw[j] + weight[i]; @@ -11596,16 +11608,16 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } float best_score = 0, scale = max; int besti1 = -1, besti2 = -1, best_shift = 0; - for (int i1 = 0; i1 <= IQ1S_BLOCK_SIZE; ++i1) { - for (int i2 = i1; i2 <= IQ1S_BLOCK_SIZE; ++i2) { - float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_p[2]; - float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_p[2]*x_p[2]; + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + float sumqx = (sumx[i1] - sumx[0])*x_p[0] + (sumx[i2] - sumx[i1])*x_p[1] + (sumx[block_size] - sumx[i2])*x_p[2]; + float sumq2 = (sumw[i1] - sumw[0])*x_p[0]*x_p[0] + (sumw[i2] - sumw[i1])*x_p[1]*x_p[1] + (sumw[block_size] - sumw[i2])*x_p[2]*x_p[2]; if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { scale = sumqx/sumq2; best_score = scale*sumqx; besti1 = i1; besti2 = i2; best_shift = 1; } - sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[IQ1S_BLOCK_SIZE] - sumx[i2])*x_m[2]; - sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[IQ1S_BLOCK_SIZE] - sumw[i2])*x_m[2]*x_m[2]; + sumqx = (sumx[i1] - sumx[0])*x_m[0] + (sumx[i2] - sumx[i1])*x_m[1] + (sumx[block_size] - sumx[i2])*x_m[2]; + sumq2 = (sumw[i1] - sumw[0])*x_m[0]*x_m[0] + (sumw[i2] - sumw[i1])*x_m[1]*x_m[1] + (sumw[block_size] - sumw[i2])*x_m[2]*x_m[2]; if (sumq2 > 0 && sumqx*sumqx > best_score*sumq2) { scale = sumqx/sumq2; best_score = scale*sumqx; besti1 = i1; besti2 = i2; best_shift = -1; @@ -11615,14 +11627,14 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_shift != 0); for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; - for (int j = besti2; j < IQ1S_BLOCK_SIZE; ++j) L[idx[2*j]] = 2; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; if (scale < 0) { - for (int j = 0; j < IQ1S_BLOCK_SIZE; ++j) L[j] = 2 - L[j]; + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; scale = -scale; best_shift = -best_shift; } bool all_on_grid = true; const float * xx = best_shift == 1 ? x_p : x_m; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + for (int k = 0; k < block_size/8; ++k) { uint16_t u = 0; for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); int grid_index = kmap_q2xs[u]; @@ -11636,7 +11648,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } if (!all_on_grid) { float sumqx = 0, sumq2 = 0; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { + for (int k = 0; k < block_size/8; ++k) { const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); for (int j = 0; j < 8; ++j) { float w = weight[8*k + j]; @@ -11648,8 +11660,8 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2; } uint16_t h = 0; - for (int k = 0; k < IQ1S_BLOCK_SIZE/8; ++k) { - y[ibl].qs[(IQ1S_BLOCK_SIZE/8)*ib + k] = index[k] & 255; + for (int k = 0; k < block_size/8; ++k) { + y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255; h |= (index[k] >> 8) << 3*k; } y[ibl].qh[ib] = h; @@ -11660,14 +11672,13 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } if (!max_scale) { - memset(y[ibl].qs, 0, QK_K/8); continue; } float d = max_scale/15; - y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.085f is another fudge factor. Don't ask me why it is needed. + y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed. float id = 1/d; - for (int ib = 0; ib < QK_K/IQ1S_BLOCK_SIZE; ++ib) { + for (int ib = 0; ib < QK_K/block_size; ++ib) { int l = nearest_int(0.5f*(id*scales[ib]-1)); l = MAX(0, MIN(7, l)); if (shifts[ib] == -1) l |= 8; @@ -11676,18 +11687,309 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + float scales[QK_K/IQ1S_BLOCK_SIZE]; + float weight[IQ1S_BLOCK_SIZE]; + int8_t L[IQ1S_BLOCK_SIZE]; + float sumx[IQ1S_BLOCK_SIZE+1]; + float sumw[IQ1S_BLOCK_SIZE+1]; + float pairs[2*IQ1S_BLOCK_SIZE]; + uint16_t index[IQ1S_BLOCK_SIZE/8]; + int8_t shifts[QK_K/IQ1S_BLOCK_SIZE]; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { - quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights); + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts); src += n_per_row; qrow += nblock*sizeof(block_iq1_s); } return nrow * nblock * sizeof(block_iq1_s); } +static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights, + float * scales, + float * weight, + float * pairs, + int8_t * L, + uint16_t * index, + int8_t * shifts) { + + const int gindex = iq2_data_index(GGML_TYPE_IQ1_M); + + const uint64_t * kgrid_q2xs = iq2_data[gindex].grid; + const int * kmap_q2xs = iq2_data[gindex].map; + const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours; + + //GGML_ASSERT(quant_weights && "missing quantization weights"); + GGML_ASSERT(kgrid_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kmap_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?"); + GGML_ASSERT(n%QK_K == 0); + + block_iq1_m * y = vy; + + const int64_t nbl = n/QK_K; + + const int block_size = IQ1M_BLOCK_SIZE; + + const float x_p[3] = {-1 + IQ1M_DELTA, IQ1M_DELTA, 1 + IQ1M_DELTA}; + const float x_m[3] = {-1 - IQ1M_DELTA, -IQ1M_DELTA, 1 - IQ1M_DELTA}; + const uint8_t masks[4] = {0x00, 0x80, 0x08, 0x88}; + + int * idx = (int *)(pairs + 1); + + float sumqx[4], sumq2[4]; + + iq1m_scale_t s; + const float * xx; + + for (int ibl = 0; ibl < nbl; ++ibl) { + +#if QK_K == 64 + y[ibl].d = GGML_FP32_TO_FP16(0.f); +#endif + memset(y[ibl].qs, 0, QK_K/8); + memset(y[ibl].qh, 0, QK_K/16); + memset(y[ibl].scales, 0, QK_K/32); + + float max_scale = 0; + + const float * xbl = x + QK_K*ibl; + float sumx2 = 0; + for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i]; + float sigma2 = 2*sumx2/QK_K; + + for (int ib = 0; ib < QK_K/block_size; ++ib) { + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + float max = fabsf(xb[0]); + for (int i = 1; i < block_size; ++i) max = MAX(max, fabsf(xb[i])); + if (!max) { + scales[ib] = 0; + memset(L, 1, block_size); + continue; + } + // Here we solve exactly the sum of squared difference (SSD) weighted minimization problem. + // With just 3 allowed quant values (-1, 0, 1), we can search exhaustively for the two + // boundaries that split the weights xb[i] into 3 groups. To do so, we sort the weights + // in ascending order, compute Si = sum[weight[j] xb[j], j = 0...i] and + // Wi = sum[weight[j], j = 0...i], and use these to quckly get get the optimum scale + // for each possible and score for each split. + for (int j = 0; j < block_size; ++j) { + pairs[2*j] = xb[j]; + idx[2*j] = j; + } + qsort(pairs, block_size, 2*sizeof(float), iq1_sort_helper); + float best_score = 0, scale = max; + int besti1 = -1, besti2 = -1, best_k = -1; + // 0: +, + + // 1: +, - + // 2: -, + + // 3: -, - + for (int i1 = 0; i1 <= block_size; ++i1) { + for (int i2 = i1; i2 <= block_size; ++i2) { + memset(sumqx, 0, 4*sizeof(float)); + memset(sumq2, 0, 4*sizeof(float)); + for (int j = 0; j < i1; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } else { + sumqx[0] += weight[i]*x_p[0]*xb[i]; + sumqx[2] += weight[i]*x_p[0]*xb[i]; + sumqx[1] += weight[i]*x_m[0]*xb[i]; + sumqx[3] += weight[i]*x_m[0]*xb[i]; + sumq2[0] += weight[i]*x_p[0]*x_p[0]; + sumq2[2] += weight[i]*x_p[0]*x_p[0]; + sumq2[1] += weight[i]*x_m[0]*x_m[0]; + sumq2[3] += weight[i]*x_m[0]*x_m[0]; + } + } + for (int j = i1; j < i2; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } else { + sumqx[0] += weight[i]*x_p[1]*xb[i]; + sumqx[2] += weight[i]*x_p[1]*xb[i]; + sumqx[1] += weight[i]*x_m[1]*xb[i]; + sumqx[3] += weight[i]*x_m[1]*xb[i]; + sumq2[0] += weight[i]*x_p[1]*x_p[1]; + sumq2[2] += weight[i]*x_p[1]*x_p[1]; + sumq2[1] += weight[i]*x_m[1]*x_m[1]; + sumq2[3] += weight[i]*x_m[1]*x_m[1]; + } + } + for (int j = i2; j < block_size; ++j) { + int i = idx[2*j]; + if (i < block_size/2) { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } else { + sumqx[0] += weight[i]*x_p[2]*xb[i]; + sumqx[2] += weight[i]*x_p[2]*xb[i]; + sumqx[1] += weight[i]*x_m[2]*xb[i]; + sumqx[3] += weight[i]*x_m[2]*xb[i]; + sumq2[0] += weight[i]*x_p[2]*x_p[2]; + sumq2[2] += weight[i]*x_p[2]*x_p[2]; + sumq2[1] += weight[i]*x_m[2]*x_m[2]; + sumq2[3] += weight[i]*x_m[2]*x_m[2]; + } + } + for (int k = 0; k < 4; ++k) { + if (sumq2[k] > 0 && sumqx[k]*sumqx[k] > best_score*sumq2[k]) { + scale = sumqx[k]/sumq2[k]; best_score = scale*sumqx[k]; + besti1 = i1; besti2 = i2; best_k = k; + } + } + } + } + GGML_ASSERT(besti1 >= 0 && besti2 >= 0 && best_k >= 0); + for (int j = 0; j < besti1; ++j) L[idx[2*j]] = 0; + for (int j = besti1; j < besti2; ++j) L[idx[2*j]] = 1; + for (int j = besti2; j < block_size; ++j) L[idx[2*j]] = 2; + if (scale < 0) { + for (int j = 0; j < block_size; ++j) L[j] = 2 - L[j]; + scale = -scale; + best_k = best_k == 0 ? 3 : best_k == 1 ? 2 : best_k == 2 ? 1 : 0; + } + bool all_on_grid = true; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? x_p : x_m; + uint16_t u = 0; + for (int j = 0; j < 8; ++j) u |= (L[8*k+j] << 2*j); + int grid_index = kmap_q2xs[u]; + if (grid_index < 0) { + all_on_grid = false; + const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1; + grid_index = iq1_find_best_neighbour2(neighbours, kgrid_q2xs, xb + 8*k, weight + 8*k, scale, xx, L + 8*k, NGRID_IQ1S); + GGML_ASSERT(grid_index >= 0); + } + index[k] = grid_index; + } + if (!all_on_grid) { + float sumqx_f = 0, sumq2_f = 0; + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = best_k < 2 ? x_p : x_m; + else xx = best_k%2 == 0 ? x_p : x_m; + const int8_t * pg = (const int8_t *)(kgrid_q2xs + index[k]); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]; + sumqx_f += w*q*xb[8*k+j]; + sumq2_f += w*q*q; + } + } + if (sumqx_f > 0 && sumq2_f > 0) scale = sumqx_f/sumq2_f; + } + y[ibl].qs[2*ib + 0] = index[0] & 255; + y[ibl].qs[2*ib + 1] = index[1] & 255; + y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4); + GGML_ASSERT(scale >= 0); + scales[ib] = scale; + shifts[ib] = best_k; + max_scale = MAX(max_scale, scale); + } + + if (!max_scale) { + continue; + } + + uint16_t * sc = (uint16_t *)y[ibl].scales; +#if QK_K == 64 + float d = max_scale/31; +#else + float d = max_scale/15; +#endif + float id = 1/d; + float sumqx_f = 0, sumq2_f = 0; + for (int ib = 0; ib < QK_K/block_size; ++ib) { + int l = nearest_int(0.5f*(id*scales[ib+0]-1)); +#if QK_K == 64 + l = MAX(0, MIN(15, l)); + sc[ib/4] |= (l << 4*(ib%4)); +#else + l = MAX(0, MIN(7, l)); + sc[ib/4] |= (l << 3*(ib%4)); +#endif + y[ibl].qh[ib] |= masks[shifts[ib]]; + const float * xb = xbl + block_size*ib; + if (quant_weights) { + const float * qw = quant_weights + QK_K*ibl + block_size*ib; + for (int i = 0; i < block_size; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]); + } else { + for (int i = 0; i < block_size; ++i) weight[i] = xb[i]*xb[i]; + } + for (int k = 0; k < block_size/8; ++k) { + if (k == 0) xx = shifts[ib] < 2 ? x_p : x_m; + else xx = shifts[ib]%2 == 0 ? x_p : x_m; + const int8_t * pg = (const int8_t *)(kgrid_q2xs + y[ibl].qs[2*ib+k] + ((y[ibl].qh[ib] << (8 - 4*k)) & 0x700)); + for (int j = 0; j < 8; ++j) { + float w = weight[8*k + j]; + float q = xx[(pg[j] - 1)/2]*(2*l+1); + sumqx_f += w*q*xb[8*k+j]; + sumq2_f += w*q*q; + } + } + } + if (sumq2_f > 0) d = sumqx_f/sumq2_f; + s.f16 = GGML_FP32_TO_FP16(d*1.1125f); // 1.1125f is another fudge factor. Don't ask me why it is needed. +#if QK_K == 64 + y[ibl].d = s.f16; +#else + sc[0] |= ((s.u16 & 0x000f) << 12); + sc[1] |= ((s.u16 & 0x00f0) << 8); + sc[2] |= ((s.u16 & 0x0f00) << 4); + sc[3] |= ((s.u16 & 0xf000) << 0); +#endif + } +} + +size_t quantize_iq1_m(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { + GGML_ASSERT(n_per_row%QK_K == 0); + float scales[QK_K/IQ1M_BLOCK_SIZE]; + float weight[IQ1M_BLOCK_SIZE]; + int8_t L[IQ1M_BLOCK_SIZE]; + float pairs[2*IQ1M_BLOCK_SIZE]; + uint16_t index[IQ1M_BLOCK_SIZE/8]; + int8_t shifts[QK_K/IQ1M_BLOCK_SIZE]; + int64_t nblock = n_per_row/QK_K; + char * qrow = (char *)dst; + for (int64_t row = 0; row < nrow; ++row) { + quantize_row_iq1_m_impl(src, qrow, n_per_row, quant_weights, scales, weight, pairs, L, index, shifts); + src += n_per_row; + qrow += nblock*sizeof(block_iq1_m); + } + return nrow * nblock * sizeof(block_iq1_m); +} + // ============================ 4-bit non-linear quants static inline int best_index_int8(int n, const int8_t * val, float x) { @@ -11812,16 +12114,16 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block } } -size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK4_NL == 0); - int nblock = n_per_row/QK4_NL; + int64_t nblock = n_per_row/QK4_NL; char * qrow = (char *)dst; uint8_t L[QK4_NL]; float weight[QK4_NL]; uint16_t unused_h; uint8_t * unused_l = NULL; float scale; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { block_iq4_nl * iq4 = (block_iq4_nl *)qrow; for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK4_NL*ibl : NULL; @@ -11834,9 +12136,9 @@ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow return nrow * nblock * sizeof(block_iq4_nl); } -void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { +void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k) { GGML_ASSERT(k%QK4_NL == 0); - int nblock = k/QK4_NL; + int64_t nblock = k/QK4_NL; uint8_t L[QK4_NL]; float weight[QK4_NL]; uint16_t unused_h; @@ -11849,22 +12151,22 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) { } } -void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) { +void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) { assert(k % QK4_NL == 0); quantize_row_iq4_nl(x, y, k); } -size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { #if QK_K == 64 return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights); #else GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; uint8_t L[QK_K]; float weight[32]; float scales[QK_K/32]; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { block_iq4_xs * iq4 = (block_iq4_xs *)qrow; for (int ibl = 0; ibl < nblock; ++ibl) { const float * qw = quant_weights ? quant_weights + QK_K*ibl : NULL; @@ -11878,20 +12180,20 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow #endif } -void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) { +void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_iq4_xs * restrict y = vy; quantize_row_iq4_xs_reference(x, y, k); } -void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) { +void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) { assert(k % QK_K == 0); quantize_iq4_xs(x, y, 1, k, NULL); } // =============================== 2.5625 bpw -static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) { +static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy, int64_t n, const float * restrict quant_weights) { const int gindex = iq2_data_index(GGML_TYPE_IQ2_S); @@ -11906,7 +12208,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy const int kMaxQ = 3; - const int nbl = n/QK_K; + const int64_t nbl = n/QK_K; block_iq2_s * y = vy; @@ -12059,11 +12361,11 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy } } -size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) { +size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) { GGML_ASSERT(n_per_row%QK_K == 0); - int nblock = n_per_row/QK_K; + int64_t nblock = n_per_row/QK_K; char * qrow = (char *)dst; - for (int row = 0; row < nrow; ++row) { + for (int64_t row = 0; row < nrow; ++row) { quantize_row_iq2_s_impl(src, qrow, n_per_row, quant_weights); src += n_per_row; qrow += nblock*sizeof(block_iq2_s); @@ -12071,13 +12373,315 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, return nrow * nblock * sizeof(block_iq2_s); } -void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) { +void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) { assert(k % QK_K == 0); quantize_iq2_s(x, y, 1, k, NULL); } -void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) { +void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) { assert(k % QK_K == 0); block_iq2_s * restrict y = vy; quantize_row_iq2_s_reference(x, y, k); } + +static bool validate_float(float f, size_t i) { + if (isinf(f)) { + fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); + return false; + } + + if (isnan(f)) { + fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i); + return false; + } + + return true; +} + +static bool isinf_fp16(ggml_fp16_t f) { + return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) == 0; +} + +static bool isnan_fp16(ggml_fp16_t f) { + return (f & 0x7c00) == 0x7c00 && (f & 0x03ff) != 0; +} + +static bool validate_fp16(ggml_fp16_t f, size_t i) { + if (isinf_fp16(f)) { + fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i); + return false; + } + + if (isnan_fp16(f)) { + fprintf(stderr, "ggml_validate_row_data: found nan value at block %zu\n", i); + return false; + } + + return true; +} + +#define VALIDATE_ROW_DATA_D_F16_IMPL(type, data, nb) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i < (nb); ++i) { \ + if (!validate_fp16(q[i].d, i)) { \ + return false; \ + } \ + } + +#define VALIDATE_ROW_DATA_DM_F16_IMPL(type, data, nb, d, m) \ + const type * q = (const type *) (data); \ + for (size_t i = 0; i < (nb); ++i) { \ + if (!validate_fp16(q[i].d, i) || !validate_fp16(q[i].m, i)) { \ + return false; \ + } \ + } + +bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes) { + if (type < 0 || type >= GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid type %d\n", __func__, type); + return false; + } + + if (nbytes % ggml_type_size(type) != 0) { + fprintf(stderr, "%s: invalid size %zu for type %d\n", __func__, nbytes, type); + return false; + } + + const size_t nb = nbytes/ggml_type_size(type); + + switch (type) { + case GGML_TYPE_BF16: + { + int nans = 0; + int infs = 0; + const unsigned short * f = (const unsigned short *) data; + for (size_t i = 0; i < nb; ++i) { + nans += (f[i] & 0x7fff) > 0x7f80; + infs += (f[i] & 0x7fff) == 0x7f80; + } + if (nans) { + fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb); + return false; + } + if (infs) { + fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb); + return false; + } + } break; + case GGML_TYPE_F16: + { + const ggml_fp16_t * f = (const ggml_fp16_t *) data; + size_t i = 0; +#if defined(__AVX2__) + for (; i + 15 < nb; i += 16) { + __m256i v = _mm256_loadu_si256((const __m256i *)(f + i)); + __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi16(0x7c00)); + __m256i cmp = _mm256_cmpeq_epi16(vexp, _mm256_set1_epi16(0x7c00)); + int mask = _mm256_movemask_epi8(cmp); + if (mask) { + for (size_t j = 0; j < 16; ++j) { + if (!validate_fp16(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#elif defined(__ARM_NEON) + for (; i + 7 < nb; i += 8) { + uint16x8_t v = vld1q_u16(f + i); + uint16x8_t vexp = vandq_u16(v, vdupq_n_u16(0x7c00)); + uint16x8_t cmp = vceqq_u16(vexp, vdupq_n_u16(0x7c00)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(cmp, 4)), 0); + if (mask) { + for (size_t j = 0; j < 8; ++j) { + if (!validate_fp16(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#endif + for (; i < nb; ++i) { + if (!validate_fp16(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_F32: + { + const float * f = (const float *) data; + size_t i = 0; +#if defined(__AVX2__) + for (; i + 7 < nb; i += 8) { + __m256i v = _mm256_loadu_si256((const __m256i *)(f + i)); + __m256i vexp = _mm256_and_si256(v, _mm256_set1_epi32(0x7f800000)); + __m256i cmp = _mm256_cmpeq_epi32(vexp, _mm256_set1_epi32(0x7f800000)); + int mask = _mm256_movemask_epi8(cmp); + if (mask) { + for (size_t j = 0; j < 8; ++j) { + if (!validate_float(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#elif defined(__ARM_NEON) + for (; i + 3 < nb; i += 4) { + uint32x4_t v = vld1q_u32((const uint32_t *)f + i); + uint32x4_t vexp = vandq_u32(v, vdupq_n_u32(0x7f800000)); + uint32x4_t cmp = vceqq_u32(vexp, vdupq_n_u32(0x7f800000)); + uint64_t mask = vget_lane_u64(vreinterpret_u64_u16(vshrn_n_u32(cmp, 8)), 0); + if (mask) { + for (size_t j = 0; j < 4; ++j) { + if (!validate_float(f[i + j], i + j)) { + return false; + } + } + GGML_UNREACHABLE(); + } + } +#endif + for (; i < nb; ++i) { + if (!validate_float(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_F64: + { + const double * f = (const double *) data; + for (size_t i = 0; i < nb; ++i) { + if (!validate_float(f[i], i)) { + return false; + } + } + } break; + case GGML_TYPE_Q4_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q4_0, data, nb); + } break; + case GGML_TYPE_Q4_1: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_1, data, nb, d, m); + } break; + case GGML_TYPE_Q5_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_0, data, nb); + } break; + case GGML_TYPE_Q5_1: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_1, data, nb, d, m); + } break; + case GGML_TYPE_Q8_0: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q8_0, data, nb); + } break; + case GGML_TYPE_Q2_K: + { + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q2_K, data, nb, d, dmin); + } break; + case GGML_TYPE_Q3_K: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q3_K, data, nb); + } break; + case GGML_TYPE_Q4_K: + { + #ifdef GGML_QKK_64 + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d[0], d[1]); + #else + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q4_K, data, nb, d, dmin); + #endif + } break; + case GGML_TYPE_Q5_K: + { + #ifdef GGML_QKK_64 + VALIDATE_ROW_DATA_D_F16_IMPL(block_q5_K, data, nb); + #else + VALIDATE_ROW_DATA_DM_F16_IMPL(block_q5_K, data, nb, d, dmin); + #endif + } break; + case GGML_TYPE_Q6_K: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_q6_K, data, nb); + } break; + case GGML_TYPE_Q8_K: + { + const block_q8_K * q = (const block_q8_K *) data; + for (size_t i = 0; i < nb; ++i) { + if (!validate_float(q[i].d, i)) { + return false; + } + } + } break; + case GGML_TYPE_IQ1_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq1_s, data, nb); + } break; + case GGML_TYPE_IQ1_M: + { + const block_iq1_m * q = (const block_iq1_m *) data; + for (size_t i = 0; i < nb; ++i) { + #if QK_K == 64 + if (!validate_fp16(q[i].d, i)) { + return false; + } + #else + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)q[i].scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + if (!validate_fp16(scale.f16, i)) { + return false; + } + #endif + } + } break; + case GGML_TYPE_IQ2_XXS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xxs, data, nb); + } break; + case GGML_TYPE_IQ2_XS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_xs, data, nb); + } break; + case GGML_TYPE_IQ2_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq2_s, data, nb); + } break; + case GGML_TYPE_IQ3_XXS: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_xxs, data, nb); + } break; + + case GGML_TYPE_IQ3_S: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq3_s, data, nb); + } break; + case GGML_TYPE_IQ4_XS: + #if QK_K != 64 + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_xs, data, nb); + } break; + #endif + // with QK_K == 64, iq4_xs is iq4_nl + case GGML_TYPE_IQ4_NL: + { + VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb); + } break; + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + // nothing to validate + break; + default: + { + fprintf(stderr, "%s: invalid type %d\n", __func__, type); + return false; + } + } + + return true; +} diff --git a/ggml-quants.h b/ggml-quants.h index aa7e54a16..4d436a8f0 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -12,69 +12,70 @@ extern "C" { #endif // Quantization -void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k); -void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k); -void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k); -void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k); -void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k); -void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k); +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k); -void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k); -void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k); -void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k); -void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); -void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); -void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k); -void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k); -void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k); -void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int k); -void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k); -void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); @@ -94,30 +95,32 @@ void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization") -size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); -size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); -size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix); +size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); +size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix); void iq2xs_init_impl(enum ggml_type type); void iq2xs_free_impl(enum ggml_type type); diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index cc9ee0762..79aec4d9f 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -740,11 +740,7 @@ namespace dpct sycl::queue &default_queue() { -#ifdef DPCT_USM_LEVEL_NONE - return out_of_order_queue(); -#else return in_order_queue(); -#endif // DPCT_USM_LEVEL_NONE } void queues_wait_and_throw() @@ -763,11 +759,7 @@ namespace dpct sycl::queue *create_queue(bool enable_exception_handler = false) { -#ifdef DPCT_USM_LEVEL_NONE - return create_out_of_order_queue(enable_exception_handler); -#else return create_in_order_queue(enable_exception_handler); -#endif // DPCT_USM_LEVEL_NONE } sycl::queue *create_queue(sycl::context context, sycl::device device, @@ -1075,11 +1067,6 @@ namespace dpct static pointer_access_attribute get_pointer_attribute(sycl::queue &q, const void *ptr) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().is_device_ptr(ptr) - ? pointer_access_attribute::device_only - : pointer_access_attribute::host_only; -#else switch (sycl::get_pointer_type(ptr, q.get_context())) { case sycl::usm::alloc::unknown: @@ -1090,7 +1077,6 @@ namespace dpct case sycl::usm::alloc::host: return pointer_access_attribute::host_device; } -#endif } template @@ -1273,11 +1259,7 @@ namespace dpct static inline void *dpct_malloc(size_t size, sycl::queue &q) { -#ifdef DPCT_USM_LEVEL_NONE - return mem_mgr::instance().mem_alloc(size * sizeof(byte_t)); -#else return sycl::malloc_device(size, q.get_device(), q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } #define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F)) @@ -1301,25 +1283,7 @@ namespace dpct static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr, valueT value, size_t size) { -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - assert(mm.is_device_ptr(dev_ptr)); - auto alloc = mm.translate_ptr(dev_ptr); - size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr; - - return q.submit([&](sycl::handler &cgh) - { - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - auto new_buffer = alloc.buffer.reinterpret( - sycl::range<1>(alloc.size / sizeof(valueT))); - sycl::accessor - acc(new_buffer, cgh, r, o); - cgh.fill(acc, value); }); -#else return q.fill(dev_ptr, value, size); -#endif // DPCT_USM_LEVEL_NONE } /** @@ -1413,72 +1377,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); GGML_UNUSED(direction); -#endif // DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. @@ -1618,45 +1518,15 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_surface[get_offset(id, to_slice, to_range.get(0))] = - from_surface[get_offset(id, from_slice, from_range.get(0))]; - }); })); -#endif - break; + event_list.push_back(q.submit([&](sycl::handler &cgh){ + cgh.depends_on(dep_events); + cgh.parallel_for( + size, + [=](sycl::id<3> id) { + to_surface[get_offset(id, to_slice, to_range.get(0))] = + from_surface[get_offset(id, from_slice, from_range.get(0))]; + }); })); + break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); } @@ -1754,11 +1624,7 @@ namespace dpct { if (ptr) { -#ifdef DPCT_USM_LEVEL_NONE - detail::mem_mgr::instance().mem_free(ptr); -#else sycl::free(ptr, q.get_context()); -#endif // DPCT_USM_LEVEL_NONE } } @@ -1766,11 +1632,7 @@ namespace dpct inline auto get_memory(const void *x) { T *new_x = reinterpret_cast(const_cast(x)); -#ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer>(new_x); -#else return new_x; -#endif } template @@ -1802,24 +1664,6 @@ namespace dpct const void *alpha, const void *a, int lda, const void *b, int ldb, const void *beta, void *c, int ldc) { -#ifndef __INTEL_MKL__ - GGML_UNUSED(q); - GGML_UNUSED(a_trans); - GGML_UNUSED(b_trans); - GGML_UNUSED(m); - GGML_UNUSED(n); - GGML_UNUSED(k); - GGML_UNUSED(alpha); - GGML_UNUSED(a); - GGML_UNUSED(lda); - GGML_UNUSED(b); - GGML_UNUSED(ldb); - GGML_UNUSED(beta); - GGML_UNUSED(c); - GGML_UNUSED(ldc); - throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces " - "Project does not support this API."); -#else Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); auto data_a = get_memory(a); @@ -1828,7 +1672,6 @@ namespace dpct oneapi::mkl::blas::column_major::gemm( q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, beta_value, data_c, ldc); -#endif } template @@ -2222,72 +2065,8 @@ namespace dpct { if (!size) return sycl::event{}; -#ifdef DPCT_USM_LEVEL_NONE - auto &mm = mem_mgr::instance(); - auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction); - - switch (real_direction) - { - case host_to_host: - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); }); - case host_to_device: - { - auto alloc = mm.translate_ptr(to_ptr); - size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(from_ptr, acc); }); - } - case device_to_host: - { - auto alloc = mm.translate_ptr(from_ptr); - size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto o = sycl::id<1>(offset); - sycl::accessor - acc(alloc.buffer, cgh, r, o); - cgh.copy(acc, to_ptr); }); - } - case device_to_device: - { - auto to_alloc = mm.translate_ptr(to_ptr); - auto from_alloc = mm.translate_ptr(from_ptr); - size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr; - return q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto r = sycl::range<1>(size); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, r, to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, r, from_o); - cgh.copy(from_acc, to_acc); }); - } - default: - throw std::runtime_error("dpct_memcpy: invalid direction value"); - } -#else return q.memcpy(to_ptr, from_ptr, size, dep_events); GGML_UNUSED(direction); -#endif // DPCT_USM_LEVEL_NONE } // Get actual copy range and make sure it will not exceed range. @@ -2427,34 +2206,6 @@ namespace dpct break; } case device_to_device: -#ifdef DPCT_USM_LEVEL_NONE - { - auto &mm = mem_mgr::instance(); - auto to_alloc = mm.translate_ptr(to_surface); - auto from_alloc = mm.translate_ptr(from_surface); - size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr; - size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr; - event_list.push_back(q.submit([&](sycl::handler &cgh) - { - cgh.depends_on(dep_events); - auto to_o = sycl::id<1>(to_offset); - auto from_o = sycl::id<1>(from_offset); - sycl::accessor - to_acc(to_alloc.buffer, cgh, - get_copy_range(size, to_slice, to_range.get(0)), to_o); - sycl::accessor - from_acc(from_alloc.buffer, cgh, - get_copy_range(size, from_slice, from_range.get(0)), from_o); - cgh.parallel_for( - size, - [=](sycl::id<3> id) { - to_acc[get_offset(id, to_slice, to_range.get(0))] = - from_acc[get_offset(id, from_slice, from_range.get(0))]; - }); })); - } -#else event_list.push_back(q.submit([&](sycl::handler &cgh) { cgh.depends_on(dep_events); @@ -2464,7 +2215,6 @@ namespace dpct to_surface[get_offset(id, to_slice, to_range.get(0))] = from_surface[get_offset(id, from_slice, from_range.get(0))]; }); })); -#endif break; default: throw std::runtime_error("dpct_memcpy: invalid direction value"); @@ -2561,6 +2311,7 @@ namespace dpct lda, b, ldb, beta, c, ldc); break; } +#ifdef __INTEL_MKL__ case detail::get_type_combination_id( library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): @@ -2622,6 +2373,7 @@ namespace dpct q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc); break; } +#endif // __INTEL_MKL__ default: throw std::runtime_error("the combination of data type is unsupported"); } @@ -2655,9 +2407,6 @@ namespace dpct void *c[], library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type) { -#ifdef DPCT_USM_LEVEL_NONE - throw std::runtime_error("this API is unsupported when USM level is none"); -#else if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2792,7 +2541,6 @@ namespace dpct default: throw std::runtime_error("the combination of data type is unsupported"); } -#endif } /// Computes a batch of matrix-matrix product with general matrices. @@ -3131,24 +2879,9 @@ namespace dpct template typename std::enable_if::type &operator[](size_t index) { init(); - #ifdef DPCT_USM_LEVEL_NONE - return dpct::get_buffer::type>( - _device_ptr) - .template get_access()[index]; - #else return _device_ptr[index]; - #endif // DPCT_USM_LEVEL_NONE } - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - return get_buffer(_device_ptr) - .template reinterpret(_range) - .template get_access::mode, - detail::memory_traits::target>(cgh); - } - #else /// Get dpct::accessor with dimension info for the device memory object /// when usm is used and dimension is greater than 1. template @@ -3156,7 +2889,6 @@ namespace dpct get_access(sycl::handler &cgh) { return dpct_accessor_t((T *)_device_ptr, _range); } - #endif // DPCT_USM_LEVEL_NONE private: device_memory(value_t *memory_ptr, size_t size) @@ -3201,15 +2933,6 @@ namespace dpct /// Default constructor device_memory() : base(1) {} - - #ifdef DPCT_USM_LEVEL_NONE - /// Get sycl::accessor for the device memory object when usm is not used. - accessor_t get_access(sycl::handler &cgh) { - auto buf = get_buffer(base::get_ptr()) - .template reinterpret(sycl::range<1>(1)); - return accessor_t(buf, cgh); - } - #endif // DPCT_USM_LEVEL_NONE }; } // namespace detail @@ -3228,7 +2951,7 @@ namespace dpct #include "ggml-common.h" static int g_ggml_sycl_debug=0; -#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0) +#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) fprintf(stderr, __VA_ARGS__);}while(0) #define CHECK_TRY_ERROR(expr) \ [&]() { \ @@ -3315,6 +3038,10 @@ typedef float dfloat; // dequantize float typedef sycl::float2 dfloat2; #endif //GGML_SYCL_F16 +#define MMVQ_MAX_BATCH_SIZE 8 + +static const int8_t kvalues_iq4nl[16]={-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113}; + bool ggml_sycl_loaded(void); void * ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void * ptr); @@ -3427,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)( #define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256 #define SYCL_ROPE_BLOCK_SIZE 256 -#define SYCL_SOFT_MAX_BLOCK_SIZE 1024 #define SYCL_ALIBI_BLOCK_SIZE 32 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 #define SYCL_QUANTIZE_BLOCK_SIZE 256 @@ -4750,6 +4476,32 @@ static void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __rest } +template +__dpct_inline__ static void +dequantize_block_iq2_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq2_s * x = (const block_iq2_s *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint8_t * grid = (const uint8_t *)(iq2s_grid + (x[i].qs[4*ib+il] | ((x[i].qh[ib] << (8-2*il)) & 0x300))); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = x[i].qs[QK_K/8+4*ib+il]; +#pragma unroll + for (int j = 0; j < 8; ++j) + y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); + +#endif + +} + template static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __restrict__ yy, const sycl::nd_item<3> &item_ct1, @@ -4782,26 +4534,26 @@ static void dequantize_block_iq3_xxs(const void * __restrict__ vx, dst_t * __res } -template -static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3s_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { +template +__dpct_inline__ static void +dequantize_block_iq3_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint8_t *kmask_iq2xs, const uint32_t *iq3s_grid) { const int i = item_ct1.get_group(2); - const block_iq3_s * x = (const block_iq3_s *) vx; + const block_iq3_s * x = (const block_iq3_s *) vx; const int tid = item_ct1.get_local_id(2); #if QK_K == 256 const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + qs[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + qs[2*il+1]); + const uint8_t * qs = x[i].qs + 8*ib; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); const uint8_t signs = x[i].signs[4*ib + il]; +#pragma unroll for (int j = 0; j < 4; ++j) { y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); @@ -4812,12 +4564,12 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr } -template -static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid, - const uint8_t *ksigns_iq2xs, - const uint8_t *kmask_iq2xs) { +template +__dpct_inline__ static void +dequantize_block_iq1_s(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + const int i = item_ct1.get_group(2); const block_iq1_s * x = (const block_iq1_s *) vx; @@ -4826,14 +4578,15 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr const int il = tid/8; // 0...3 const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; - const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]); - const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1); - const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7]; - for (int j = 0; j < 4; ++j) { - y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); - y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f); + const float delta = x[i].qh[ib] & 0x8000 ? -1 - IQ1S_DELTA : -1 + IQ1S_DELTA; + const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 7) + 1); + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[ib] >> 3*il) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); } #else assert(false); @@ -4841,6 +4594,85 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr } +template +__dpct_inline__ static void +dequantize_block_iq1_m(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1, + const uint32_t *iq1s_grid_gpu) { + + const int i = item_ct1.get_group(2); + const block_iq1_m * x = (const block_iq1_m *) vx; + + const int tid = item_ct1.get_local_id(2); +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * sc = (const uint16_t *)x[i].scales; + iq1m_scale_t scale; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const int ib16 = 2*ib + il/2; // sc[ib16/4] >> 3*(ib16%4) -> sc[ib/2] >> 3*((2*ib+il/2)%4); + const float d = (float)scale.f16 * (2*((sc[ib16/4] >> 3*(ib16%4)) & 0x7) + 1); + const float delta = x[i].qh[2*ib+il/2] & (0x08 << 4*(il%2)) ? -1 - IQ1M_DELTA : -1 + IQ1M_DELTA; + uint32_t grid32[2]; const int8_t * q = (const int8_t *)grid32; + grid32[0] = iq1s_grid_gpu[x[i].qs[4*ib+il] | (((x[i].qh[2*ib+il/2] >> 4*(il%2)) & 7) << 8)]; + grid32[1] = (grid32[0] >> 4) & 0x0f0f0f0f; + grid32[0] &= 0x0f0f0f0f; +#pragma unroll + for (int j = 0; j < 8; ++j) { + y[j] = d * (q[j] + delta); + } +#else + assert(false); +#endif + +} + +template +__dpct_inline__ static void +dequantize_block_iq4_nl(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + + const int i = item_ct1.get_group(2); + const block_iq4_nl * x = (const block_iq4_nl *) vx + i*(QK_K/QK4_NL); + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[ib].qs + 4*il; + const float d = (float)x[ib].d; +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } + +} + + +template +__dpct_inline__ static void +dequantize_block_iq4_xs(const void *__restrict__ vx, dst_t *__restrict__ yy, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_group(2); + const block_iq4_xs * x = (const block_iq4_xs *)vx; + + const int tid = item_ct1.get_local_id(2); + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 4*il; + const uint8_t * q4 = x[i].qs + 16*ib + 4*il; + const float d = (float)x[i].d * ((((x[i].scales_l[ib/2] >> 4*(ib%2)) & 0xf) | (((x[i].scales_h >> 2*ib) & 3) << 4)) - 32); +#pragma unroll + for (int j = 0; j < 4; ++j) { + y[j+ 0] = d * kvalues_iq4nl[q4[j] & 0xf]; + y[j+16] = d * kvalues_iq4nl[q4[j] >> 4]; + } +} + + + /* DPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register @@ -7647,6 +7479,58 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq, #endif } +static __dpct_inline__ float +vec_dot_iq2_s_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq2_s * bq2 = (const block_iq2_s *) vbq; + + const int ib32 = iqs; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t * signs = bq2->qs + QK_K/8 + 4*ib32; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi1 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi1); + sumi1 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi1); + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint32_t * grid = (const uint32_t *)(iq2s_grid + (bq2->qs[4*ib32+l] | ((bq2->qh[ib32] << (8-2*l)) & 0x300))); + const uint32_t signs0 = dpct::vectorized_binary( + ((signs[l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const uint32_t signs1 = dpct::vectorized_binary( + ((signs[l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, + std::equal_to<>()); + const int grid_l = dpct::vectorized_binary( + grid[0] ^ signs0, signs0, std::minus<>()); + const int grid_h = dpct::vectorized_binary( + grid[1] ^ signs1, signs1, std::minus<>()); + sumi2 = dpct::dp4a(grid_l, *((const int *)q8 + 0), sumi2); + sumi2 = dpct::dp4a(grid_h, *((const int *)q8 + 1), sumi2); + q8 += 8; + } + const float d = (float)bq2->d * bq8_1[ib32].ds[0] * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); +#endif +} + static __dpct_inline__ float vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, const block_q8_1 *__restrict__ bq8_1, const int &iqs, @@ -7689,10 +7573,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq, static __dpct_inline__ float vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq3s_grid, const uint64_t *ksigns64) { -#if DPCT_COMPATIBILITY_TEMP >= \ - MIN_CC_DP4A // lowest compute capability for integer intrinsics + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq3s_grid) { #if QK_K == 256 const block_iq3_s * bq2 = (const block_iq3_s *) vbq; @@ -7704,9 +7586,11 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); uint32_t signs0 = dpct::vectorized_binary( - ((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + ((bq2->signs[4 * ib32 + l] & 0xf) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); uint32_t signs1 = dpct::vectorized_binary( - ((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201, std::equal_to<>()); + ((bq2->signs[4 * ib32 + l] >> 4) * 0x01010101) & 0x08040201, + 0x08040201, std::equal_to<>()); const int grid_l = dpct::vectorized_binary( grid1[0] ^ signs0, signs0, std::minus<>()); const int grid_h = dpct::vectorized_binary( @@ -7715,45 +7599,142 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq, sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); q8 += 8; } - const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * bq8_1[ib32].ds[0]; + const float d = + (float)bq2->d * + (1 + 2 * ((bq2->scales[ib32 / 2] >> 4 * (ib32 % 2)) & 0xf)) * + bq8_1[ib32].ds[0]; return d * sumi; #else assert(false); - return 0.f; -#endif -#else - assert(false); - return 0.f; #endif } static __dpct_inline__ float vec_dot_iq1_s_q8_1(const void *__restrict__ vbq, - const block_q8_1 *__restrict__ bq8_1, const int &iqs, - const uint32_t *iq1s_grid, const uint64_t *ksigns64) { + const block_q8_1 *__restrict__ bq8_1, const int &iqs, + const uint32_t *iq1s_grid_gpu) { #if QK_K == 256 const block_iq1_s * bq1 = (const block_iq1_s *) vbq; const int ib32 = iqs; - const uint8_t * qs = bq1->qs + 4*ib32; - const int8_t * q8 = bq8_1[ib32].qs; int sumi = 0; + const int * q8 = (const int *)bq8_1[ib32].qs; for (int l = 0; l < 4; ++l) { - const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]); - const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8)); - const int grid_l = dpct::vectorized_binary( - grid[0] ^ signs[0], signs[0], std::minus<>()); - const int grid_h = dpct::vectorized_binary( - grid[1] ^ signs[1], signs[1], std::minus<>()); - sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi); - sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi); - q8 += 8; + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[ib32] >> 3*l) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi)); } - const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f; - return d * sumi; + + const float delta = bq1->qh[ib32] & 0x8000 ? -1-IQ1S_DELTA : -1+IQ1S_DELTA; + const float d1q = (float)bq1->d * (2*((bq1->qh[ib32] >> 12) & 7) + 1); + const float d = d1q * bq8_1[ib32].ds[0]; + const float m = d1q * bq8_1[ib32].ds[1]; + return d * sumi + m * delta; +#else + assert(false); +#endif +} + +static __dpct_inline__ float +vec_dot_iq1_m_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { +#if QK_K == 256 + const block_iq1_m * bq1 = (const block_iq1_m *) vbq; + + const int ib32 = iqs; + int sumi[2] = {0, 0}; + float sumf[2] = {0.f, 0.f}; + + const int * q8 = (const int *)bq8_1[ib32].qs; + for (int l = 0; l < 4; ++l) { + const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs[4*ib32+l] | (((bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 7) << 8))); + int grid0 = grid[0] & 0x0f0f0f0f; + int grid1 = (grid[0] >> 4) & 0x0f0f0f0f; + sumi[l / 2] = dpct::dp4a(q8[2 * l + 1], grid1, + dpct::dp4a(q8[2 * l + 0], grid0, sumi[l / 2])); + const float delta = (bq1->qh[2*ib32+l/2] >> 4*(l%2)) & 0x08 ? -1-IQ1M_DELTA : -1+IQ1M_DELTA; + const int sumy = dpct::dp4a(q8[2 * l + 1], 0x01010101, + dpct::dp4a(q8[2 * l + 0], 0x01010101, 0)); + sumf[l/2] += delta*sumy; + } + + iq1m_scale_t scale; + const uint16_t * sc = (const uint16_t *)bq1->scales; + scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000); + const float d = (float)scale.f16 * bq8_1[ib32].ds[0]; + return d * ((sumi[0] + sumf[0]) * (2*((sc[ib32/2] >> 6*(ib32%2)) & 0x7) + 1) + (sumi[1] + sumf[1]) * (2*((sc[ib32/2] >> (6*(ib32%2)+3)) & 0x7) + 1)); +#else + assert(false); +#endif +} + +static __dpct_inline__ void get_int_from_table_16(const uint32_t &q4, + const uint8_t *values, + int &val1, int &val2) { + + uint32_t aux32; const uint8_t * q8 = (const uint8_t *)&aux32; + aux32 = q4 & 0x0f0f0f0f; + uint16_t v1 = values[q8[0]] | (values[q8[1]] << 8); + uint16_t v2 = values[q8[2]] | (values[q8[3]] << 8); + val1 = v1 | (v2 << 16); + aux32 = (q4 >> 4) & 0x0f0f0f0f; + v1 = values[q8[0]] | (values[q8[1]] << 8); + v2 = values[q8[2]] | (values[q8[3]] << 8); + val2 = v1 | (v2 << 16); +} + + +static __dpct_inline__ float +vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + + const block_iq4_nl * bq = (const block_iq4_nl *) vbq; + + const uint16_t * q4 = (const uint16_t *)bq->qs + 2*iqs; + const int32_t * q8 = (const int32_t *)bq8_1->qs + iqs; + + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int l = 0; l < VDR_Q4_0_Q8_1_MMVQ; ++l) { + const uint32_t aux = q4[2*l] | (q4[2*l+1] << 16); + get_int_from_table_16(aux, values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[l + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[l + 4], sumi2); + } + + const float d = (float)bq->d * bq8_1->ds[0]; + return d * (sumi1 + sumi2); +} + + +static __dpct_inline__ float +vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq, + const block_q8_1 *__restrict__ bq8_1, const int &iqs) { + +#if QK_K == 256 + const block_iq4_xs * bq4 = (const block_iq4_xs *) vbq; + const uint8_t * values = (const uint8_t *)kvalues_iq4nl; + + // iqs is 0...7 + const int ib32 = iqs; + const int32_t * q8 = (const int *)bq8_1[ib32].qs; + const uint32_t * q4 = (const uint32_t *)bq4->qs + 4*ib32; + const int8_t ls = ((bq4->scales_l[ib32/2] >> 4*(ib32%2)) & 0xf) | (((bq4->scales_h >> 2*ib32) & 3) << 4); + const float d = (float)bq4->d * (ls - 32) * bq8_1[ib32].ds[0]; + int v1, v2; + int sumi1 = 0, sumi2 = 0; + for (int j = 0; j < 4; ++j) { + get_int_from_table_16(q4[j], values, v1, v2); + sumi1 = dpct::dp4a(v1, q8[j + 0], sumi1); + sumi2 = dpct::dp4a(v2, q8[j + 4], sumi2); + } + return d * (sumi1 + sumi2); #else assert(false); - return 0.f; #endif } @@ -8338,8 +8319,7 @@ template static void template static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr) { + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8350,24 +8330,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ const int blocks_per_row = ncols / qk; const int blocks_per_warp = vdr * WARP_SIZE / qi; -// partial sum for each thread + const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block + + // partial sum for each thread float tmp = 0.0f; const block_q_t * x = (const block_q_t *) vx; const block_q8_1 * y = (const block_q8_1 *) vy; - for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row; i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i; // x block index + const int ibx = row * blocks_per_row + i; // x block index - const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + const int iby = i * (qk / QK8_1); // y block index that aligns with ibx - const int iqs = - vdr * - (item_ct1.get_local_id(2) % - (qi / vdr)); // x block quant index when casting the quants to int + const int iqs = + vdr * + (item_ct1.get_local_id(2) - + i * qi_vdr); // x block quant index when casting the quants to int - tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); + tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); } // sum up partial sums and write back result @@ -8383,10 +8365,11 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_ } template -static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xxs_grid_ptr, const uint8_t *ksigns_iq2xs_ptr, - const uint8_t *kmask_iq2xs_ptr ) { +static void mul_mat_vec_q_iq2_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8414,7 +8397,7 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid_ptr, ksigns_iq2xs_ptr, kmask_iq2xs_ptr); + tmp += vec_dot_iq2_xxs_q8_1(&x[ibx], &y[iby], iqs, iq2xxs_grid, ksigns_iq2xs, kmask_iq2xs); } // sum up partial sums and write back result @@ -8430,9 +8413,11 @@ static void mul_mat_vec_q_iq2_xxs_q8_1(const void * __restrict__ vx, const void } template -static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint64_t *iq2xs_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq2_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8460,7 +8445,7 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq2_xs_q8_1(&x[ibx], &y[iby], iqs, iq2xs_grid, ksigns64); } // sum up partial sums and write back result @@ -8476,9 +8461,11 @@ static void mul_mat_vec_q_iq2_xs_q8_1(const void * __restrict__ vx, const void * } template -static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3xxs_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq2_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8506,7 +8493,7 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq2_s_q8_1(&x[ibx], &y[iby], iqs); } // sum up partial sums and write back result @@ -8522,9 +8509,11 @@ static void mul_mat_vec_q_iq3_xxs_q8_1(const void * __restrict__ vx, const void } template -static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq3s_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq3_xxs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8552,7 +8541,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq3_xxs_q8_1(&x[ibx], &y[iby], iqs, iq3xxs_grid, ksigns64); } // sum up partial sums and write back result @@ -8568,9 +8557,11 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void * } template -static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows, - const sycl::nd_item<3> &item_ct1, - const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) { +static void mul_mat_vec_q_iq3_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); @@ -8598,7 +8589,7 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * (item_ct1.get_local_id(2) % (qi / vdr)); // x block quant index when casting the quants to int - tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_ptr, ksigns64_ptr); + tmp += vec_dot_iq3_s_q8_1(&x[ibx], &y[iby], iqs, iq3s_grid); } // sum up partial sums and write back result @@ -8613,6 +8604,200 @@ static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * } } +template +static void mul_mat_vec_q_iq1_s_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_s_q8_1(&x[ibx], &y[iby], iqs, iq1s_grid_gpu); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq1_m_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq1_m_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + +template +static void mul_mat_vec_q_iq4_nl_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_nl_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + +template +static void mul_mat_vec_q_iq4_xs_q8_1(const void *__restrict__ vx, + const void *__restrict__ vy, + float *__restrict__ dst, const int ncols, + const int nrows, + const sycl::nd_item<3> &item_ct1) { + const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) + + item_ct1.get_local_id(1); + + if (row >= nrows) { + return; + } + + const int blocks_per_row = ncols / qk; + const int blocks_per_warp = vdr * WARP_SIZE / qi; + +// partial sum for each thread + float tmp = 0.0f; + + const block_q_t * x = (const block_q_t *) vx; + const block_q8_1 * y = (const block_q8_1 *) vy; + + for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; + i += blocks_per_warp) { + const int ibx = row*blocks_per_row + i; // x block index + + const int iby = i * (qk/QK8_1); // y block index that aligns with ibx + + const int iqs = + vdr * + (item_ct1.get_local_id(2) % + (qi / vdr)); // x block quant index when casting the quants to int + + tmp += vec_dot_iq4_xs_q8_1(&x[ibx], &y[iby], iqs); + } + + // sum up partial sums and write back result +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + tmp += + dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask); + } + + if (item_ct1.get_local_id(2) == 0) { + dst[row] = tmp; + } +} + + template static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows, const sycl::nd_item<3> &item_ct1) { @@ -9174,64 +9359,71 @@ static void k_sum_rows_f32(const float * x, float * dst, const int ncols, } } + template -static inline void swap(T & a, T & b) { +static inline void ggml_sycl_swap(T & a, T & b) { T tmp = a; a = b; b = tmp; } -template -static void k_argsort_f32_i32(const float * x, int * dst, const int ncols, - const sycl::nd_item<3> &item_ct1) { +template +__dpct_inline__ static void +k_argsort_f32_i32(const float *x, int *dst, const int ncols, int ncols_pad, + const sycl::nd_item<3> &item_ct1, uint8_t *dpct_local) { // bitonic sort int col = item_ct1.get_local_id(2); int row = item_ct1.get_group(1); - if (col >= ncols) return; + if (col >= ncols_pad) { + return; + } const float * x_row = x + row * ncols; - int * dst_row = dst + row * ncols; + auto dst_row = (int *)dpct_local; // initialize indices - if (col < ncols) { - dst_row[col] = col; - } - /* - DPCT1065:58: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better - performance if there is no access to global memory. - */ - item_ct1.barrier(); + dst_row[col] = col; - for (int k = 2; k <= ncols; k *= 2) { + item_ct1.barrier(sycl::access::fence_space::local_space); + + for (int k = 2; k <= ncols_pad; k *= 2) { for (int j = k / 2; j > 0; j /= 2) { int ixj = col ^ j; if (ixj > col) { if ((col & k) == 0) { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); + if (dst_row[col] >= ncols || + (dst_row[ixj] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] > x_row[dst_row[ixj]] : + x_row[dst_row[col]] < x_row[dst_row[ixj]])) + ) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); } } else { - if (order == GGML_SORT_ORDER_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) { - swap(dst_row[col], dst_row[ixj]); + if (dst_row[ixj] >= ncols || + (dst_row[col] < ncols && (order == GGML_SORT_ORDER_ASC ? + x_row[dst_row[col]] < x_row[dst_row[ixj]] : + x_row[dst_row[col]] > x_row[dst_row[ixj]])) + ) { + ggml_sycl_swap(dst_row[col], dst_row[ixj]); } } } /* - DPCT1118:11: SYCL group functions and algorithms must be encountered + DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code. */ - /* - DPCT1065:59: Consider replacing sycl::nd_item::barrier() with - sycl::nd_item::barrier(sycl::access::fence_space::local_space) for - better performance if there is no access to global memory. - */ - item_ct1.barrier(); + item_ct1.barrier(sycl::access::fence_space::local_space); } } + + // copy the result to dst without the padding + if (col < ncols) { + dst[row * ncols + col] = dst_row[col]; + } } + static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past, const sycl::nd_item<3> &item_ct1) { const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) + @@ -10210,31 +10402,64 @@ static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k, #endif } +template +static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_s( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} + +template +static void dequantize_row_iq1_m_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq1_m( + vx, y, item_ct1, iq1s_grid_gpu + ); + }); + }); + } +} template static void dequantize_row_iq2_xxs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq2xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_xxs( - vx, y, item_ct1, iq2xxs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq2xxs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } @@ -10245,54 +10470,58 @@ static void dequantize_row_iq2_xs_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq2xs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq2_xs( - vx, y, item_ct1, iq2xs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq2xs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } } template -static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, - dpct::queue_ptr stream) { +static void dequantize_row_iq2_s_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq3xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); + cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq2_s(vx, y, item_ct1); + }); + }); + } +} + +template +static void dequantize_row_iq3_xxs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = k / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_xxs( - vx, y, item_ct1, iq3xxs_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, iq3xxs_grid, + ksigns_iq2xs, kmask_iq2xs); }); }); } @@ -10303,59 +10532,68 @@ static void dequantize_row_iq3_s_sycl(const void *vx, dst_t *y, const int k, dpct::queue_ptr stream) { const int nb = k / QK_K; { - iq3s_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), [=](sycl::nd_item<3> item_ct1) { dequantize_block_iq3_s( - vx, y, item_ct1, iq3s_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, y, item_ct1, kmask_iq2xs, iq3s_grid); }); }); } } template -static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k, - dpct::queue_ptr stream) { - const int nb = k / QK_K; - { - iq1s_grid_gpu.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); +static void dequantize_row_iq4_xs_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; +#if QK_K == 64 + dequantize_row_iq4_nl_sycl(vx, y, k, stream); +#else + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); - - stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); - - cgh.parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * - sycl::range<3>(1, 1, 32), - sycl::range<3>(1, 1, 32)), - [=](sycl::nd_item<3> item_ct1) { - dequantize_block_iq1_s( - vx, y, item_ct1, iq1s_grid_ptr_ct1, - ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); - }); - }); - } + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_xs(vx, y, item_ct1); + }); + }); + } +#endif } + +template +static void dequantize_row_iq4_nl_sycl(const void *vx, dst_t *y, const int k, + dpct::queue_ptr stream) { + const int nb = (k + QK_K - 1) / QK_K; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * + sycl::range<3>(1, 1, 32), + sycl::range<3>(1, 1, 32)), + [=](sycl::nd_item<3> item_ct1) { + dequantize_block_iq4_nl(vx, y, item_ct1); + }); + }); + } +} + + + template static void convert_unary_sycl(const void *__restrict__ vx, dst_t *__restrict__ y, const int k, @@ -10400,16 +10638,24 @@ static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) try { return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_sycl; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; case GGML_TYPE_F32: return convert_unary_sycl; default: @@ -10444,16 +10690,24 @@ static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) { return dequantize_row_q5_K_sycl; case GGML_TYPE_Q6_K: return dequantize_row_q6_K_sycl; + case GGML_TYPE_IQ1_S: + return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ1_M: + return dequantize_row_iq1_m_sycl; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_sycl; case GGML_TYPE_IQ2_XS: return dequantize_row_iq2_xs_sycl; + case GGML_TYPE_IQ2_S: + return dequantize_row_iq2_s_sycl; case GGML_TYPE_IQ3_XXS: return dequantize_row_iq3_xxs_sycl; case GGML_TYPE_IQ3_S: return dequantize_row_iq3_s_sycl; - case GGML_TYPE_IQ1_S: - return dequantize_row_iq1_s_sycl; + case GGML_TYPE_IQ4_XS: + return dequantize_row_iq4_xs_sycl; + case GGML_TYPE_IQ4_NL: + return dequantize_row_iq4_nl_sycl; case GGML_TYPE_F16: return convert_unary_sycl; default: @@ -10675,12 +10929,8 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10688,8 +10938,7 @@ static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10704,12 +10953,8 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10717,8 +10962,7 @@ static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10733,12 +10977,8 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10746,8 +10986,7 @@ static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10762,12 +11001,8 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10775,8 +11010,7 @@ static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10791,12 +11025,8 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10804,8 +11034,7 @@ static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10820,12 +11049,8 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10833,8 +11058,7 @@ static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10849,12 +11073,8 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10862,8 +11082,7 @@ static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10878,12 +11097,8 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10891,8 +11106,7 @@ static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10907,12 +11121,8 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10920,8 +11130,7 @@ static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10936,12 +11145,8 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), @@ -10949,13 +11154,13 @@ static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy, [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } } + static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols, const int nrows, @@ -10965,23 +11170,13 @@ static void mul_mat_vec_iq2_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq2xxs_grid.init(*stream); - ksigns_iq2xs.init(*stream); - kmask_iq2xs.init(*stream); - - stream->submit([&](sycl::handler &cgh) { - auto iq2xxs_grid_ptr_ct1 = iq2xxs_grid.get_ptr(); - auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr(); - auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr(); - cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq2_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq2xxs_grid_ptr_ct1, ksigns_iq2xs_ptr_ct1, kmask_iq2xs_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -10996,20 +11191,42 @@ static void mul_mat_vec_iq2_xs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq2xs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq2xs_grid_ptr_ct1 = iq2xs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq2_xs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq2xs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq2_s_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + auto iq2xs_grid_ptr_ct1 = &iq2xs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq2_s_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -11024,20 +11241,17 @@ static void mul_mat_vec_iq3_xxs_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3xxs_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3xxs_grid_ptr_ct1 = iq3xxs_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq3xxs_grid_ptr_ct1 = &iq3xxs_grid[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq3_xxs_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq3xxs_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -11052,20 +11266,16 @@ static void mul_mat_vec_iq3_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq3s_grid.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq3s_grid_ptr_ct1 = iq3s_grid.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq3s_grid_ptr_ct1 = &iq3s_grid[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq3_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq3s_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -11080,20 +11290,82 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy, const sycl::range<3> block_nums(1, 1, block_num_y); const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); { - iq1s_grid_gpu.init(*stream); - ksigns64.init(*stream); stream->submit([&](sycl::handler &cgh) { - auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr(); - auto ksigns64_ptr_ct1 = ksigns64.get_ptr(); + auto iq1s_grid_ptr_ct1 = &iq1s_grid_gpu[0]; + auto ksigns64_ptr_ct1 = &ksigns64[0]; cgh.parallel_for( sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { mul_mat_vec_q_iq1_s_q8_1( - vx, vy, dst, ncols, nrows, item_ct1, - iq1s_grid_ptr_ct1, ksigns64_ptr_ct1); + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq1_m_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq1_m_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_nl_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK4_NL == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_nl_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); + }); + }); + } +} + +static void mul_mat_vec_iq4_xs_q8_1_sycl(const void *vx, const void *vy, + float *dst, const int ncols, + const int nrows, + dpct::queue_ptr stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y; + const sycl::range<3> block_nums(1, 1, block_num_y); + const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE); + { + + stream->submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) + [[intel::reqd_sub_group_size(32)]] { + mul_mat_vec_q_iq4_xs_q8_1( + vx, vy, dst, ncols, nrows, item_ct1); }); }); } @@ -12717,36 +12989,54 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, }); } +static int next_power_of_2(int x) { + int n = 1; + while (n < x) { + n *= 2; + } + return n; +} + static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols, const int nrows, ggml_sort_order order, dpct::queue_ptr stream) { // bitonic sort requires ncols to be power of 2 - GGML_ASSERT((ncols & (ncols - 1)) == 0); + const int ncols_pad = next_power_of_2(ncols); - const sycl::range<3> block_dims(1, 1, ncols); + const sycl::range<3> block_dims(1, 1, ncols_pad); const sycl::range<3> block_nums(1, nrows, 1); + const size_t shared_mem = ncols_pad * sizeof(int); + + // GGML_ASSERT(shared_mem <= ggml_cuda_info().devices[ggml_cuda_get_device()].smpb); + if (order == GGML_SORT_ORDER_ASC) { - /* - DPCT1049:44: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr() + .get()); + }); + }); } else if (order == GGML_SORT_ORDER_DESC) { - /* - DPCT1049:45: The work-group size passed to the SYCL kernel may exceed - the limit. To get the device limit, query - info::device::max_work_group_size. Adjust the work-group size if needed. - */ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - k_argsort_f32_i32(x, dst, ncols, item_ct1); - }); + stream->submit([&](sycl::handler &cgh) { + sycl::local_accessor dpct_local_acc_ct1( + sycl::range<1>(shared_mem), cgh); + + cgh.parallel_for( + sycl::nd_range<3>(block_nums * block_dims, block_dims), + [=](sycl::nd_item<3> item_ct1) { + k_argsort_f32_i32( + x, dst, ncols, ncols_pad, item_ct1, + dpct_local_acc_ct1.get_multi_ptr() + .get()); + }); + }); } else { GGML_ASSERT(false); } @@ -12791,11 +13081,13 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float * const int nrows_y, const float scale, const float max_bias, dpct::queue_ptr stream) { int nth = WARP_SIZE; - while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2; + int max_block_size = g_work_group_size; + while (nth < ncols_x && nth < max_block_size) nth *= 2; + if (nth>max_block_size) nth = max_block_size; + const sycl::range<3> block_dims(1, 1, nth); const sycl::range<3> block_nums(1, 1, nrows_x); const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE); - static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); @@ -12805,6 +13097,12 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float * const size_t local_mem_size = stream->get_device().get_info(); if (n_local_scratch*sizeof(float) < local_mem_size) { + if (ncols_x > max_block_size) { + soft_max_f32_submitter(x, mask, pos, dst, ncols_x, nrows_y, scale, + max_bias, m0, m1, n_head_log2, block_nums, + block_dims, n_local_scratch, stream); + return; + } switch (ncols_x) { case 32: soft_max_f32_submitter(x, mask, pos, dst, ncols_x, nrows_y, scale, @@ -13120,20 +13418,27 @@ void print_device_detail(int id, sycl::device &device, std::string device_type) version += std::to_string(prop.get_minor_version()); device_type = std::regex_replace(device_type, std::regex("ext_oneapi_"), ""); + std::string name = std::string(prop.get_name()); + name = std::regex_replace(name, std::regex("\\(R\\)"), ""); + name = std::regex_replace(name, std::regex("\\(TM\\)"), ""); - fprintf(stderr, "|%2d|%18s|%45s|%10s|%11d|%8d|%7d|%15lu|\n", id, device_type.c_str(), - prop.get_name(), version.c_str(), prop.get_max_compute_units(), + auto global_mem_size = prop.get_global_mem_size()/1000000; + + fprintf(stderr, "|%2d|%19s|%39s|%7s|%7d|%8d|%5d|%6luM|%21s|\n", id, device_type.c_str(), + name.c_str(), version.c_str(), prop.get_max_compute_units(), prop.get_max_work_group_size(), prop.get_max_sub_group_size(), - prop.get_global_mem_size()); + global_mem_size, device.get_info().c_str()); } void ggml_backend_sycl_print_sycl_devices() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_print_sycl_devices\n"); int device_count = dpct::dev_mgr::instance().device_count(); std::map DeviceNums; fprintf(stderr, "found %d SYCL devices:\n", device_count); - fprintf(stderr, "| | | |Compute |Max compute|Max work|Max sub| |\n"); - fprintf(stderr, "|ID| Device Type| Name|capability|units |group |group |Global mem size|\n"); - fprintf(stderr, "|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------|\n"); + fprintf(stderr, "| | | | |Max | |Max |Global | |\n"); + fprintf(stderr, "| | | | |compute|Max work|sub |mem | |\n"); + fprintf(stderr, "|ID| Device Type| Name|Version|units |group |group|size | Driver version|\n"); + fprintf(stderr, "|--|-------------------|---------------------------------------|-------|-------|--------|-----|-------|---------------------|\n"); for (int id = 0; id < device_count; ++id) { sycl::device device = dpct::dev_mgr::instance().get_device(id); sycl::backend backend = device.get_backend(); @@ -13181,11 +13486,13 @@ int get_work_group_size(int user_device_id) { return prop.get_max_work_group_size(); } -void ggml_init_sycl() try { +static void ggml_init_sycl() try { static bool initialized = false; if (!initialized) { + fprintf(stderr, "[SYCL] call ggml_init_sycl\n"); g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); #if defined(GGML_SYCL_F16) @@ -13871,8 +14178,12 @@ static int64_t get_row_rounding(ggml_type type, const std::array= VER_GEN9 ? 128 : 64; case GGML_TYPE_IQ3_S: return max_compute_capability >= VER_GEN9 ? 128 : 64; @@ -13891,11 +14202,20 @@ inline void ggml_sycl_op_mul_mat_vec_q( const int64_t src1_ncols, const int64_t src1_padded_row_size, const dpct::queue_ptr &stream) { - GGML_ASSERT(ggml_nrows(src1) == 1); + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; + int id; + SYCL_CHECK( + CHECK_TRY_ERROR(id = get_current_device_id())); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne00 : row_diff; + switch (src0->type) { case GGML_TYPE_Q4_0: mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); @@ -13927,20 +14247,32 @@ inline void ggml_sycl_op_mul_mat_vec_q( case GGML_TYPE_Q6_K: mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ1_S: + mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ1_M: + mul_mat_vec_iq1_m_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_IQ2_XXS: mul_mat_vec_iq2_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_IQ2_XS: mul_mat_vec_iq2_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_S: + mul_mat_vec_iq2_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; case GGML_TYPE_IQ3_XXS: mul_mat_vec_iq3_xxs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; case GGML_TYPE_IQ3_S: mul_mat_vec_iq3_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; - case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + case GGML_TYPE_IQ4_NL: + mul_mat_vec_iq4_nl_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; + case GGML_TYPE_IQ4_XS: + mul_mat_vec_iq4_xs_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; default: GGML_ASSERT(false); @@ -14022,6 +14354,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec( convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); break; default: + printf("ggml_sycl_op_dequantize_mul_mat_vec unsupported GGML_TYPE %d\n", src0->type); GGML_ASSERT(false); break; } @@ -14413,7 +14746,12 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0, GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); + const ggml_tensor * src2 = dst->src[2]; + +#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support") +#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional + GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional const int64_t ne00 = src0->ne[0]; const int64_t nrows_x = ggml_nrows(src0); @@ -14429,7 +14767,6 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0, float * src2_dd = nullptr; sycl_pool_alloc src2_f; - ggml_tensor * src2 = dst->src[2]; const bool use_src2 = src2 != nullptr; if (use_src2) { @@ -14876,8 +15213,8 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10; } // do the computation - op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, - dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream); + SYCL_CHECK(CHECK_TRY_ERROR(op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, + dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream))); /* DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -15246,6 +15583,9 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; + bool no_mixed_dtypes = main_stream->get_backend() == sycl::backend::ext_oneapi_cuda || + main_stream->get_backend() == sycl::backend::ext_oneapi_hip; + SYCL_CHECK( CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream)); @@ -15276,24 +15616,38 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, dpct::library_data_t cu_compute_type = dpct::library_data_t::real_float; dpct::library_data_t cu_data_type = dpct::library_data_t::real_float; + if (no_mixed_dtypes) { + cu_compute_type = dpct::library_data_t::real_half; + cu_data_type = dpct::library_data_t::real_half; + } // dst strides size_t nbd2 = dst->nb[2]; size_t nbd3 = dst->nb[3]; + const float alpha_f32 = 1.0f; + const float beta_f32 = 0.0f; + const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; - const float alpha_f32 = 1.0f; - const float beta_f32 = 0.0f; - const void * alpha = &alpha_f32; const void * beta = &beta_f32; + if (no_mixed_dtypes) { + alpha = &alpha_f16; + beta = &beta_f16; + } // TODO: Renable (dst->op_params[0] =! GGML_PREC_DEFAULT) pathway - // oneMKL open source supports half, half, float, float: datatypes + // when oneMKL open source supports half, half, float, float: datatypes dst_t = (char *) dst_ddf; + if (no_mixed_dtypes) { + dst_t = (char *) dst_f16.alloc(ne_dst); + + nbd2 /= sizeof(float) / sizeof(sycl::half); + nbd3 /= sizeof(float) / sizeof(sycl::half); + } GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -15379,6 +15733,10 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, } #endif + if (no_mixed_dtypes) { + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); + to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream); + } } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -15437,11 +15795,17 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 #ifdef GGML_SYCL_FORCE_DMMV const bool use_mul_mat_vec_q = false; #else - const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1; + bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type); + use_mul_mat_vec_q = use_mul_mat_vec_q || + (src0->type == GGML_TYPE_IQ2_XXS) || (src0->type == GGML_TYPE_IQ2_XS) || (src0->type == GGML_TYPE_IQ2_S) || + (src0->type == GGML_TYPE_IQ3_XXS) || (src0->type == GGML_TYPE_IQ3_S) || + (src0->type == GGML_TYPE_IQ4_NL) || (src0->type == GGML_TYPE_IQ4_XS) || + (src0->type == GGML_TYPE_IQ1_S) || (src0->type == GGML_TYPE_IQ1_M); + + #endif // GGML_SYCL_FORCE_DMMV if (use_mul_mat_vec_q) { - // NOTE: this kernel does not support ggml_nrows(src1) > 1 // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true); } else { @@ -15644,73 +16008,76 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst) try { -#if 0 - ggml_sycl_mul_mat_id_sycl(dst); - // TODO: mmq/mmv support -#endif - - const int64_t nb11 = src1->nb[1]; - const int64_t nb1 = dst->nb[1]; - - const struct ggml_tensor * ids = src0; - const int32_t id = ((int32_t *) dst->op_params)[0]; - const int32_t n_as = ((int32_t *) dst->op_params)[1]; - - std::vector ids_host(ggml_nbytes(ids)); - + GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT && + "mul_mat_id does not support split buffers"); + const ggml_tensor *ids = dst->src[2]; const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; - if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - SYCL_CHECK(CHECK_TRY_ERROR( - stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)).wait())); - // SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); - } else { - memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); - } + const size_t nb11 = src1->nb[1]; + const size_t nb1 = dst->nb[1]; - const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; - const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; + const int32_t id = ((int32_t *)dst->op_params)[0]; + const int32_t n_as = src0->ne[2]; + std::vector ids_host(ggml_nbytes(ids)); + const char *ids_dev = (const char *)ids->data; + + SYCL_CHECK(CHECK_TRY_ERROR( + stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); + SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); + + const ggml_tensor_extra_gpu *src0_extra = + (const ggml_tensor_extra_gpu *)src0->extra; + const ggml_tensor_extra_gpu *src1_extra = + (const ggml_tensor_extra_gpu *)src1->extra; + const ggml_tensor_extra_gpu *dst_extra = + (const ggml_tensor_extra_gpu *)dst->extra; + + ggml_tensor_extra_gpu src0_row_extra; ggml_tensor_extra_gpu src1_row_extra; ggml_tensor_extra_gpu dst_row_extra; + ggml_tensor src0_row = *src0; ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; src1_row.backend = GGML_BACKEND_TYPE_GPU; dst_row.backend = GGML_BACKEND_TYPE_GPU; + src0_row.extra = &src0_row_extra; src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + char *src0_original = src1->backend == GGML_BACKEND_TYPE_CPU + ? (char *)src0->data + : (char *)src0_extra->data_device[g_main_device]; + char *src1_original = src1->backend == GGML_BACKEND_TYPE_CPU + ? (char *)src1->data + : (char *)src1_extra->data_device[g_main_device]; + char *dst_original = dst->backend == GGML_BACKEND_TYPE_CPU + ? (char *)dst->data + : (char *)dst_extra->data_device[g_main_device]; + + src0_row.ne[2] = 1; + src0_row.ne[3] = 1; + src0_row.nb[3] = src0->nb[2]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0])); - //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0])); - - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); + const int32_t row_id = + *(const int32_t *)(ids_host.data() + i01 * ids->nb[1] + + id * ids->nb[0]); GGML_ASSERT(row_id >= 0 && row_id < n_as); - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; + src0_row_extra.data_device[g_main_device] = + src0_original + row_id * src0->nb[2]; + src1_row_extra.data_device[g_main_device] = + src1_original + i01 * src1->nb[1]; + dst_row_extra.data_device[g_main_device] = + dst_original + i01 * dst->nb[1]; - src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? - - dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - - ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); + ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row); } } else { sycl_pool_alloc src1_contiguous(sizeof(float)*ggml_nelements(src1)); @@ -15720,8 +16087,6 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); for (int32_t row_id = 0; row_id < n_as; ++row_id) { - const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - int64_t num_src1_rows = 0; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); @@ -15734,7 +16099,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11, - src1_original + i01 * nb11, nb11).wait())); + src1_original + i01 * nb11, nb11))); num_src1_rows++; } @@ -15742,6 +16107,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, continue; } + src0_row_extra.data_device[g_main_device] = + src0_original + row_id * src0->nb[2]; + src1_row.ne[1] = num_src1_rows; dst_row.ne[1] = num_src1_rows; @@ -15753,7 +16121,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); + ggml_sycl_mul_mat(&src0_row, &src1_row, &dst_row); num_src1_rows = 0; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -15767,7 +16135,7 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( dst_original + i01 * nb1, - dst_contiguous.get() + num_src1_rows * nb1, nb1).wait())); + dst_contiguous.get() + num_src1_rows * nb1, nb1))); num_src1_rows++; } } @@ -16278,6 +16646,7 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ } GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { + GGML_SYCL_DEBUG("[SYCL] call ggml_sycl_get_gpu_list\n"); for(int i=0;igpus[device]; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( @@ -16326,6 +16696,7 @@ catch (sycl::exception const &exc) { GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total) try { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_memory\n"); ggml_sycl_set_device(device); /* @@ -16466,11 +16837,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); - + char* host_buf = (char*)malloc(size); + memcpy(host_buf, data, size); SYCL_CHECK( CHECK_TRY_ERROR((*stream) - .memcpy((char *)tensor->data + offset, data, size) + .memcpy((char *)tensor->data + offset, host_buf, size) .wait())); + free(host_buf); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -16677,6 +17050,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { }; ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_index) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n"); + if (device_index>=g_device_count or device_index<0) { printf("ggml_backend_sycl_buffer_type error: device_index:%d is out of range [0, %d], miss to call ggml_backend_sycl_set_single_device()\n", device_index, g_device_count-1); @@ -17046,6 +17421,8 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_split_buffer_type\n"); + ggml_init_sycl(); // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -17117,6 +17494,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm } ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_host_buffer_type\n"); static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = { /* .iface = */ { /* .get_name = */ ggml_backend_sycl_host_buffer_type_name, @@ -17231,7 +17609,7 @@ GGML_CALL static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t back params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } #ifndef NDEBUG @@ -17289,9 +17667,14 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons return false; } ggml_type a_type = a->type; - if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ2_S || - a_type == GGML_TYPE_IQ4_XS) { - return false; + if (a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ4_XS || + a_type == GGML_TYPE_IQ3_XXS || a_type == GGML_TYPE_IQ3_S || + a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ2_S || + a_type == GGML_TYPE_IQ1_S || a_type == GGML_TYPE_IQ1_M + ) { + if (b->ne[1] == 1 && ggml_nrows(b) > 1) { + return false; + } } return true; } break; @@ -17379,6 +17762,13 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_sycl_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS && op->op != GGML_OP_MUL_MAT_ID; + GGML_UNUSED(backend); +} + + static ggml_backend_i ggml_backend_sycl_interface = { /* .get_name = */ ggml_backend_sycl_name, /* .free = */ ggml_backend_sycl_free, @@ -17392,7 +17782,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, - /* .offload_op = */ NULL, + /* .offload_op = */ ggml_backend_sycl_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -17406,7 +17796,8 @@ static ggml_guid_t ggml_backend_sycl_guid() { } GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { - ggml_init_sycl(); // TODO: remove from ggml.c + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_init\n"); + ggml_init_sycl(); check_allow_gpu_index(device); @@ -17432,6 +17823,7 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { } GGML_CALL int ggml_backend_sycl_get_device_count() { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_count\n"); if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); return g_sycl_gpu_mgr->get_gpu_count(); } @@ -17444,16 +17836,21 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, } GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_index\n"); return g_sycl_gpu_mgr->get_index(device_id); } GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index) { + GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_get_device_id\n"); return g_sycl_gpu_mgr->gpus[device_index]; } GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) { - GGML_ASSERT(main_gpu_id #include #include -#include #include #include #include @@ -121,8 +120,16 @@ struct vk_device { vk_matmul_pipeline pipeline_dequant_mul_mat_mat[VK_NUM_TYPES]; + vk_matmul_pipeline pipeline_matmul_id_f32; + vk_matmul_pipeline pipeline_matmul_id_f16; + vk_matmul_pipeline pipeline_matmul_id_f16_f32; + + vk_matmul_pipeline pipeline_dequant_mul_mat_mat_id[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant[VK_NUM_TYPES]; - vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_f16_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_id_f32[VK_NUM_TYPES]; vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; @@ -140,7 +147,7 @@ struct vk_device { vk_pipeline pipeline_silu_f32; vk_pipeline pipeline_relu_f32; vk_pipeline pipeline_diag_mask_inf_f32; - vk_pipeline pipeline_soft_max_f32; + vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16; vk_pipeline pipeline_rope_f32, pipeline_rope_f16; vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; vk_pipeline pipeline_argsort_f32; @@ -216,6 +223,21 @@ struct vk_submission { typedef std::vector vk_sequence; +struct vk_mat_mat_push_constants { + uint32_t M; uint32_t N; uint32_t K; + uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; uint32_t k_split; + uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; + uint32_t expert_stride_b; uint32_t expert_stride_d; + uint32_t idx; uint32_t nbi1; uint32_t n_as; +}; + +struct vk_mat_vec_push_constants { + uint32_t ncols; uint32_t stride_a; uint32_t stride_b; uint32_t stride_d; + uint32_t ne02; uint32_t ne12; uint32_t broadcast2; uint32_t broadcast3; + uint32_t batch_stride_a; uint32_t batch_stride_b; uint32_t batch_stride_d; +}; + struct vk_op_push_constants { uint32_t KX; uint32_t KY; @@ -340,8 +362,8 @@ struct ggml_backend_vk_context { size_t semaphore_idx, event_idx; ggml_vk_garbage_collector gc; std::vector> pinned_memory; - size_t prealloc_size_qx, prealloc_size_qy, prealloc_size_x, prealloc_size_y, prealloc_size_split_k; - vk_buffer prealloc_qx, prealloc_qy, prealloc_x, prealloc_y, prealloc_split_k; + size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k; + vk_buffer prealloc_x, prealloc_y, prealloc_split_k; vk::Fence fence; vk_buffer staging; size_t staging_size; @@ -809,7 +831,7 @@ static uint32_t find_properties(const vk::PhysicalDeviceMemoryProperties* mem_pr static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t size, vk::MemoryPropertyFlags req_flags, vk::MemoryPropertyFlags fallback_flags = vk::MemoryPropertyFlags(0)) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_create_buffer(" << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl; + std::cerr << "ggml_vk_create_buffer(device " << ctx->idx << ", " << size << ", " << to_string(req_flags) << ", " << to_string(fallback_flags) << ")" << std::endl; #endif vk_buffer buf = std::make_shared(); @@ -998,132 +1020,428 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0] = std::make_shared(); ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1] = std::make_shared(); ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K] = std::make_shared(); + + /*ctx->device->pipeline_matmul_id_f32 = std::make_shared(); + ctx->device->pipeline_matmul_id_f16_f32 = std::make_shared(); + ctx->device->pipeline_matmul_id_f16 = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K] = std::make_shared(); + ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared();*/ if (device->fp16) { - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_len, matmul_q4_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_len, matmul_q4_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_0_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_0_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_0_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_0_f32_l", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_0_f32_m", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_0_f32_s", matmul_q4_1_f32_len, matmul_q4_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_1_f32_aligned_len, matmul_q4_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_len, matmul_q5_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_len, matmul_q5_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_len, matmul_q5_1_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_len, matmul_q5_1_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_len, matmul_q8_0_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_len, matmul_q8_0_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_len, matmul_q2_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_len, matmul_q2_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_len, matmul_q3_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_len, matmul_q3_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_len, matmul_q4_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_len, matmul_q4_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_len, matmul_q5_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_len, matmul_q5_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_len, matmul_q6_k_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_len, matmul_q6_k_f32_aligned_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + /*ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_len, matmul_id_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_len, matmul_id_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_len, matmul_id_f16_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_len, matmul_id_f16_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_len, matmul_id_f16_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_len, matmul_id_f16_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_len, matmul_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_len, matmul_id_q4_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_1_f32_len, matmul_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_1_f32_aligned_len, matmul_id_q4_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_len, matmul_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_len, matmul_id_q5_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_len, matmul_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_len, matmul_id_q5_1_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_len, matmul_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_len, matmul_id_q8_0_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_len, matmul_id_q2_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_len, matmul_id_q2_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_len, matmul_id_q3_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_len, matmul_id_q3_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_len, matmul_id_q4_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_len, matmul_id_q4_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_len, matmul_id_q5_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_len, matmul_id_q5_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_len, matmul_id_q6_k_f32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_len, matmul_id_q6_k_f32_aligned_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);*/ } else { - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32->a_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16->a_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32->a_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->l, "matmul_q4_0_f32_l", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->m, "matmul_q4_0_f32_m", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->s, "matmul_q4_0_f32_s", matmul_q4_0_f32_fp32_len, matmul_q4_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_l, "matmul_q4_0_f32_aligned_l", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_m, "matmul_q4_0_f32_aligned_m", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_0]->a_s, "matmul_q4_0_f32_aligned_s", matmul_q4_0_f32_aligned_fp32_len, matmul_q4_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->l, "matmul_q4_1_f32_l", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->m, "matmul_q4_1_f32_m", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->s, "matmul_q4_1_f32_s", matmul_q4_1_f32_fp32_len, matmul_q4_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_l, "matmul_q4_1_f32_aligned_l", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_m, "matmul_q4_1_f32_aligned_m", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_1]->a_s, "matmul_q4_1_f32_aligned_s", matmul_q4_1_f32_aligned_fp32_len, matmul_q4_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->l, "matmul_q5_0_f32_l", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->m, "matmul_q5_0_f32_m", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->s, "matmul_q5_0_f32_s", matmul_q5_0_f32_fp32_len, matmul_q5_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_l, "matmul_q5_0_f32_aligned_l", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_m, "matmul_q5_0_f32_aligned_m", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_0]->a_s, "matmul_q5_0_f32_aligned_s", matmul_q5_0_f32_aligned_fp32_len, matmul_q5_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->l, "matmul_q5_1_f32_l", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->m, "matmul_q5_1_f32_m", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->s, "matmul_q5_1_f32_s", matmul_q5_1_f32_fp32_len, matmul_q5_1_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_l, "matmul_q5_1_f32_aligned_l", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_m, "matmul_q5_1_f32_aligned_m", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_1]->a_s, "matmul_q5_1_f32_aligned_s", matmul_q5_1_f32_aligned_fp32_len, matmul_q5_1_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_mmq_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_mmq_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->l, "matmul_q8_0_f32_l", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->m, "matmul_q8_0_f32_m", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->s, "matmul_q8_0_f32_s", matmul_q8_0_f32_fp32_len, matmul_q8_0_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_l, "matmul_q8_0_f32_aligned_l", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_m, "matmul_q8_0_f32_aligned_m", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q8_0]->a_s, "matmul_q8_0_f32_aligned_s", matmul_q8_0_f32_aligned_fp32_len, matmul_q8_0_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->l, "matmul_q2_k_f32_l", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->m, "matmul_q2_k_f32_m", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->s, "matmul_q2_k_f32_s", matmul_q2_k_f32_fp32_len, matmul_q2_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_l, "matmul_q2_k_f32_aligned_l", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_m, "matmul_q2_k_f32_aligned_m", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q2_K]->a_s, "matmul_q2_k_f32_aligned_s", matmul_q2_k_f32_aligned_fp32_len, matmul_q2_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->l, "matmul_q3_k_f32_l", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->m, "matmul_q3_k_f32_m", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->s, "matmul_q3_k_f32_s", matmul_q3_k_f32_fp32_len, matmul_q3_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_l, "matmul_q3_k_f32_aligned_l", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_m, "matmul_q3_k_f32_aligned_m", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q3_K]->a_s, "matmul_q3_k_f32_aligned_s", matmul_q3_k_f32_aligned_fp32_len, matmul_q3_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->l, "matmul_q4_k_f32_l", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->m, "matmul_q4_k_f32_m", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->s, "matmul_q4_k_f32_s", matmul_q4_k_f32_fp32_len, matmul_q4_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_l, "matmul_q4_k_f32_aligned_l", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_m, "matmul_q4_k_f32_aligned_m", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q4_K]->a_s, "matmul_q4_k_f32_aligned_s", matmul_q4_k_f32_aligned_fp32_len, matmul_q4_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->l, "matmul_q5_k_f32_l", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->m, "matmul_q5_k_f32_m", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->s, "matmul_q5_k_f32_s", matmul_q5_k_f32_fp32_len, matmul_q5_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_l, "matmul_q5_k_f32_aligned_l", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_m, "matmul_q5_k_f32_aligned_m", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q5_K]->a_s, "matmul_q5_k_f32_aligned_s", matmul_q5_k_f32_aligned_fp32_len, matmul_q5_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->l, "matmul_q6_k_f32_l", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->m, "matmul_q6_k_f32_m", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->s, "matmul_q6_k_f32_s", matmul_q6_k_f32_fp32_len, matmul_q6_k_f32_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_l, "matmul_q6_k_f32_aligned_l", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_m, "matmul_q6_k_f32_aligned_m", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat[GGML_TYPE_Q6_K]->a_s, "matmul_q6_k_f32_aligned_s", matmul_q6_k_f32_aligned_fp32_len, matmul_q6_k_f32_aligned_fp32_data, "main", 3, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + /*ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->l, "matmul_id_f32_l", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->m, "matmul_id_f32_m", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->s, "matmul_id_f32_s", matmul_id_f32_fp32_len, matmul_id_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_l, "matmul_id_f32_aligned_l", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_m, "matmul_id_f32_aligned_m", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f32->a_s, "matmul_id_f32_aligned_s", matmul_id_f32_aligned_fp32_len, matmul_id_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->l, "matmul_id_f16_l", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->m, "matmul_id_f16_m", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->s, "matmul_id_f16_s", matmul_id_f16_fp32_len, matmul_id_f16_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_l, "matmul_id_f16_aligned_l", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_m, "matmul_id_f16_aligned_m", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16->a_s, "matmul_id_f16_aligned_s", matmul_id_f16_aligned_fp32_len, matmul_id_f16_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->l, "matmul_id_f16_f32_l", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->m, "matmul_id_f16_f32_m", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->s, "matmul_id_f16_f32_s", matmul_id_f16_f32_fp32_len, matmul_id_f16_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_l, "matmul_id_f16_f32_aligned_l", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_m, "matmul_id_f16_f32_aligned_m", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_id_f16_f32->a_s, "matmul_id_f16_f32_aligned_s", matmul_id_f16_f32_aligned_fp32_len, matmul_id_f16_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_0_f32_fp32_len, matmul_id_q4_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_0]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_0_f32_aligned_fp32_len, matmul_id_q4_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->l, "matmul_id_q4_0_f32_l", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->m, "matmul_id_q4_0_f32_m", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->s, "matmul_id_q4_0_f32_s", matmul_id_q4_1_f32_fp32_len, matmul_id_q4_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_l, "matmul_id_q4_0_f32_aligned_l", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_m, "matmul_id_q4_0_f32_aligned_m", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_1]->a_s, "matmul_id_q4_0_f32_aligned_s", matmul_id_q4_1_f32_aligned_fp32_len, matmul_id_q4_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->l, "matmul_id_q5_0_f32_l", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->m, "matmul_id_q5_0_f32_m", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->s, "matmul_id_q5_0_f32_s", matmul_id_q5_0_f32_fp32_len, matmul_id_q5_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_l, "matmul_id_q5_0_f32_aligned_l", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_m, "matmul_id_q5_0_f32_aligned_m", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_0]->a_s, "matmul_id_q5_0_f32_aligned_s", matmul_id_q5_0_f32_aligned_fp32_len, matmul_id_q5_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->l, "matmul_id_q5_1_f32_l", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->m, "matmul_id_q5_1_f32_m", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->s, "matmul_id_q5_1_f32_s", matmul_id_q5_1_f32_fp32_len, matmul_id_q5_1_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_l, "matmul_id_q5_1_f32_aligned_l", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_m, "matmul_id_q5_1_f32_aligned_m", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_1]->a_s, "matmul_id_q5_1_f32_aligned_s", matmul_id_q5_1_f32_aligned_fp32_len, matmul_id_q5_1_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->l, "matmul_id_q8_0_f32_l", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->m, "matmul_id_q8_0_f32_m", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->s, "matmul_id_q8_0_f32_s", matmul_id_q8_0_f32_fp32_len, matmul_id_q8_0_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_l, "matmul_id_q8_0_f32_aligned_l", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_m, "matmul_id_q8_0_f32_aligned_m", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q8_0]->a_s, "matmul_id_q8_0_f32_aligned_s", matmul_id_q8_0_f32_aligned_fp32_len, matmul_id_q8_0_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->l, "matmul_id_q2_k_f32_l", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->m, "matmul_id_q2_k_f32_m", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->s, "matmul_id_q2_k_f32_s", matmul_id_q2_k_f32_fp32_len, matmul_id_q2_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_l, "matmul_id_q2_k_f32_aligned_l", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_m, "matmul_id_q2_k_f32_aligned_m", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q2_K]->a_s, "matmul_id_q2_k_f32_aligned_s", matmul_id_q2_k_f32_aligned_fp32_len, matmul_id_q2_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->l, "matmul_id_q3_k_f32_l", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->m, "matmul_id_q3_k_f32_m", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->s, "matmul_id_q3_k_f32_s", matmul_id_q3_k_f32_fp32_len, matmul_id_q3_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_l, "matmul_id_q3_k_f32_aligned_l", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_m, "matmul_id_q3_k_f32_aligned_m", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q3_K]->a_s, "matmul_id_q3_k_f32_aligned_s", matmul_id_q3_k_f32_aligned_fp32_len, matmul_id_q3_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->l, "matmul_id_q4_k_f32_l", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->m, "matmul_id_q4_k_f32_m", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->s, "matmul_id_q4_k_f32_s", matmul_id_q4_k_f32_fp32_len, matmul_id_q4_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_l, "matmul_id_q4_k_f32_aligned_l", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_m, "matmul_id_q4_k_f32_aligned_m", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q4_K]->a_s, "matmul_id_q4_k_f32_aligned_s", matmul_id_q4_k_f32_aligned_fp32_len, matmul_id_q4_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->l, "matmul_id_q5_k_f32_l", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->m, "matmul_id_q5_k_f32_m", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->s, "matmul_id_q5_k_f32_s", matmul_id_q5_k_f32_fp32_len, matmul_id_q5_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_l, "matmul_id_q5_k_f32_aligned_l", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_m, "matmul_id_q5_k_f32_aligned_m", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q5_K]->a_s, "matmul_id_q5_k_f32_aligned_s", matmul_id_q5_k_f32_aligned_fp32_len, matmul_id_q5_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->l, "matmul_id_q6_k_f32_l", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->m, "matmul_id_q6_k_f32_m", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->s, "matmul_id_q6_k_f32_s", matmul_id_q6_k_f32_fp32_len, matmul_id_q6_k_f32_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_l, "matmul_id_q6_k_f32_aligned_l", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_mmq_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_m, "matmul_id_q6_k_f32_aligned_m", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_mmq_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K]->a_s, "matmul_id_q6_k_f32_aligned_s", matmul_id_q6_k_f32_aligned_fp32_len, matmul_id_q6_k_f32_aligned_fp32_data, "main", 4, sizeof(vk_mat_mat_push_constants), s_wg_denoms, warptile_mmq_s, s_align);*/ } - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + // mul mat vec + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32_f32", mul_mat_vec_f16_f32_f32_len, mul_mat_vec_f16_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32_f32", mul_mat_vec_q4_0_f32_f32_len, mul_mat_vec_q4_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32_f32", mul_mat_vec_q4_1_f32_f32_len, mul_mat_vec_q4_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32_f32", mul_mat_vec_q5_0_f32_f32_len, mul_mat_vec_q5_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32_f32", mul_mat_vec_q5_1_f32_f32_len, mul_mat_vec_q5_1_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32_f32", mul_mat_vec_q8_0_f32_f32_len, mul_mat_vec_q8_0_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32_f32", mul_mat_vec_q2_K_f32_f32_len, mul_mat_vec_q2_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32_f32", mul_mat_vec_q3_K_f32_f32_len, mul_mat_vec_q3_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32_f32", mul_mat_vec_q4_K_f32_f32_len, mul_mat_vec_q4_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32_f32", mul_mat_vec_q5_K_f32_f32_len, mul_mat_vec_q5_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32_f32", mul_mat_vec_q6_K_f32_f32_len, mul_mat_vec_q6_K_f32_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f16_f32", mul_mat_vec_f16_f16_f32_len, mul_mat_vec_f16_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f16_f32", mul_mat_vec_q4_0_f16_f32_len, mul_mat_vec_q4_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f16_f32", mul_mat_vec_q4_1_f16_f32_len, mul_mat_vec_q4_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f16_f32", mul_mat_vec_q5_0_f16_f32_len, mul_mat_vec_q5_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f16_f32", mul_mat_vec_q5_1_f16_f32_len, mul_mat_vec_q5_1_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f16_f32", mul_mat_vec_q8_0_f16_f32_len, mul_mat_vec_q8_0_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f16_f32", mul_mat_vec_q2_K_f16_f32_len, mul_mat_vec_q2_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f16_f32", mul_mat_vec_q3_K_f16_f32_len, mul_mat_vec_q3_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f16_f32", mul_mat_vec_q4_K_f16_f32_len, mul_mat_vec_q4_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f16_f32", mul_mat_vec_q5_K_f16_f32_len, mul_mat_vec_q5_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f16_f32", mul_mat_vec_q6_K_f16_f32_len, mul_mat_vec_q6_K_f16_f32_data, "main", 3, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + + /*ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_F16 ], "mul_mat_vec_id_f16_f32", mul_mat_vec_id_f16_f32_len, mul_mat_vec_id_f16_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_0], "mul_mat_vec_id_q4_0_f32", mul_mat_vec_id_q4_0_f32_len, mul_mat_vec_id_q4_0_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_1], "mul_mat_vec_id_q4_1_f32", mul_mat_vec_id_q4_1_f32_len, mul_mat_vec_id_q4_1_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_0], "mul_mat_vec_id_q5_0_f32", mul_mat_vec_id_q5_0_f32_len, mul_mat_vec_id_q5_0_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_1], "mul_mat_vec_id_q5_1_f32", mul_mat_vec_id_q5_1_f32_len, mul_mat_vec_id_q5_1_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q8_0], "mul_mat_vec_id_q8_0_f32", mul_mat_vec_id_q8_0_f32_len, mul_mat_vec_id_q8_0_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q2_K], "mul_mat_vec_id_q2_K_f32", mul_mat_vec_id_q2_K_f32_len, mul_mat_vec_id_q2_K_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q3_K], "mul_mat_vec_id_q3_K_f32", mul_mat_vec_id_q3_K_f32_len, mul_mat_vec_id_q3_K_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q4_K], "mul_mat_vec_id_q4_K_f32", mul_mat_vec_id_q4_K_f32_len, mul_mat_vec_id_q4_K_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q5_K], "mul_mat_vec_id_q5_K_f32", mul_mat_vec_id_q5_K_f32_len, mul_mat_vec_id_q5_K_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_id_f32[GGML_TYPE_Q6_K], "mul_mat_vec_id_q6_K_f32", mul_mat_vec_id_q6_K_f32_len, mul_mat_vec_id_q6_K_f32_data, "main", 4, sizeof(vk_mat_vec_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);*/ // dequant shaders ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", dequant_f32_len, dequant_f32_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); @@ -1139,19 +1457,21 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); // get_rows - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F32 ], "get_rows_f32", get_rows_f32_len, get_rows_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f32_f32", get_rows_f32_f32_len, get_rows_f32_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F16 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), { 512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); @@ -1182,6 +1502,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32_f16, "soft_max_f32_f16", soft_max_f32_f16_len, soft_max_f32_f16_data, "main", 4, sizeof(vk_op_soft_max_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); @@ -1341,7 +1662,33 @@ void ggml_vk_instance_init() { vk_instance.device_indices.push_back(tmp); } } else { - vk_instance.device_indices.push_back(0); + std::vector devices = vk_instance.instance.enumeratePhysicalDevices(); + + // Make sure at least one device exists + if (devices.empty()) { + std::cerr << "ggml_vulkan: Error: No devices found." << std::endl; + GGML_ASSERT(false); + } + + // Default to using all dedicated GPUs + for (size_t i = 0; i < devices.size(); i++) { + vk::PhysicalDeviceProperties props = devices[i].getProperties(); + + if (props.deviceType == vk::PhysicalDeviceType::eDiscreteGpu) { + vk_instance.device_indices.push_back(i); + } + } + + // If no dedicated GPUs found, fall back to GPU 0 + if (vk_instance.device_indices.empty()) { + vk_instance.device_indices.push_back(0); + } + } + + std::cerr << "ggml_vulkan: Found " << vk_instance.device_indices.size() << " Vulkan devices:" << std::endl; + + for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { + ggml_vk_print_gpu_info(i); } vk_instance_initialized = true; @@ -1567,6 +1914,15 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte switch (src0_type) { case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: break; default: return nullptr; @@ -1575,11 +1931,48 @@ static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_pipeline(ggml_backend_vk_conte return ctx->device->pipeline_dequant_mul_mat_mat[src0_type]; } -static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) { +static vk_matmul_pipeline ggml_vk_get_mul_mat_mat_id_pipeline(ggml_backend_vk_context * ctx, ggml_type src0_type, ggml_type src1_type) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_get_mul_mat_mat_id_pipeline()" << std::endl; +#endif + if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f32; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return ctx->device->pipeline_matmul_id_f16_f32; + } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { + return ctx->device->pipeline_matmul_id_f16; + } + + GGML_ASSERT(src1_type == GGML_TYPE_F32); + + switch (src0_type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: + return nullptr; + } + + return ctx->device->pipeline_dequant_mul_mat_mat_id[src0_type]; +} + +static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type a_type, ggml_type b_type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif - switch (type) { + GGML_ASSERT(b_type == GGML_TYPE_F32 || b_type == GGML_TYPE_F16); + + switch (a_type) { case GGML_TYPE_F16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -1596,7 +1989,7 @@ static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * return nullptr; } - return ctx->device->pipeline_dequant_mul_mat_vec_f32[type]; + return b_type == GGML_TYPE_F32 ? ctx->device->pipeline_dequant_mul_mat_vec_f32_f32[a_type] : ctx->device->pipeline_dequant_mul_mat_vec_f16_f32[a_type]; } static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) { @@ -1802,6 +2195,9 @@ static void ggml_vk_ctx_begin(ggml_backend_vk_context * ctx, vk_context * subctx } static size_t ggml_vk_align_size(size_t width, size_t align) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_align_size(" << width << ", " << align << ")" << std::endl; +#endif return CEIL_DIV(width, align) * align; } @@ -2034,7 +2430,6 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds ggml_vk_submit(subctx, ctx->fence); VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); ctx->device->device.resetFences({ ctx->fence }); - ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); } } @@ -2131,7 +2526,6 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); } - ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); } } @@ -2259,11 +2653,13 @@ static uint32_t ggml_vk_guess_split_k(int m, int n, int k) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_split_k(" << m << ", " << n << ", " << k << ")" << std::endl; #endif - if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { - return 4; - } + // if (k > 128 && (m < 128 || n < 128) && m > 2 && n > 2) { + // return 4; + // } return 1; + + GGML_UNUSED(m); GGML_UNUSED(n); GGML_UNUSED(k); } static vk_pipeline ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, int m, int n, bool aligned) { @@ -2298,6 +2694,8 @@ static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, return ggml_vk_guess_matmul_pipeline_apple(ctx, mmp, aligned); case VK_VENDOR_ID_INTEL: return ggml_vk_guess_matmul_pipeline_intel(ctx, mmp, aligned); + default: + break; } if (m <= 32 || n <= 32) { @@ -2313,25 +2711,58 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; #endif - return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, false)->align; + return ggml_vk_guess_matmul_pipeline(ctx, mmp, m, n, true)->align; } -static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { +static void ggml_vk_matmul( + ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, + uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, + uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, + uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, + uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; #endif ggml_vk_sync_buffers(subctx); if (split_k == 1) { - const std::array pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc.size() * sizeof(uint32_t), pc.data(), { m, n, batch }); + const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d, expert_stride_b, expert_stride_d, idx, nbi1, n_as }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); return; } GGML_ASSERT(batch_stride_d == m * n); - const std::array pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; + const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d, expert_stride_b, expert_stride_d, idx, nbi1, n_as }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); + ggml_vk_sync_buffers(subctx); + const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); +} + +static void ggml_vk_matmul_id( + ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, + vk_subbuffer&& ids, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& a, vk_subbuffer&& split_k_buffer, + uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, + uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, + uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d, + uint32_t expert_stride_b, uint32_t expert_stride_d, uint32_t idx, uint32_t nbi1, uint32_t n_as) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), c: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << split_k_buffer.buffer->buffer << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ")" << std::endl; +#endif + ggml_vk_sync_buffers(subctx); + if (split_k == 1) { + const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, k, ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d, expert_stride_b, expert_stride_d, idx, nbi1, n_as }; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { ids, b, d, a }, sizeof(vk_mat_mat_push_constants), &pc, { m, n, batch }); + return; + } + + GGML_ASSERT(batch_stride_d == m * n); + + const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d, expert_stride_b, expert_stride_d, idx, nbi1, n_as }; + // Make sure enough workgroups get assigned for split k to work + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { ids, b, split_k_buffer, a }, sizeof(vk_mat_mat_push_constants), &pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); @@ -2423,11 +2854,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; - - const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; @@ -2449,7 +2877,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su const int d_ne = ne11 * ne01; const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); - const bool aligned = ne10 == kpad; + const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); @@ -2469,16 +2897,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; - if (load_x) { - d_Qx = ctx->prealloc_qx; - } else if (!src0_uma) { + if (!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } - if (load_y) { - d_Qy = ctx->prealloc_qy; - } else if (!src1_uma) { + if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); @@ -2530,38 +2954,34 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); - } else if (load_x || qx_needs_dequant) { - if (load_x) { - // copy data to device - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); - ctx->staging_offset = qx_sz * ne02 * ne03; - } - - if (qx_needs_dequant) { - const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); - } + } else if (qx_needs_dequant) { + const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } if (y_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); - } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } uint32_t stride_batch_x = ne00*ne01; uint32_t stride_batch_y = ne10*ne11; - if (!ggml_vk_dim01_contiguous(src0) && !load_x && !qx_needs_dequant) { + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); } - if (!ggml_vk_dim01_contiguous(src1) && !load_y && !qy_needs_dequant) { + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); } // compute - ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT + ggml_vk_matmul( + ctx, subctx, pipeline, + { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, + { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, + ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21, + 0, 0, 0, 0, 1 + ); // NOLINT if (dst->backend == GGML_BACKEND_TYPE_CPU) { // copy dst to host @@ -2591,8 +3011,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context GGML_ASSERT(ne11 == 1); - const uint64_t nb2 = dst->nb[2]; - const uint64_t nb3 = dst->nb[3]; + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + const uint64_t ne22 = dst->ne[2]; + const uint64_t ne23 = dst->ne[3]; const uint64_t r2 = ne12 / ne02; const uint64_t r3 = ne13 / ne03; @@ -2616,17 +3038,17 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context src1_uma = d_Qy != nullptr; } - const bool load_x = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; - const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; - - const bool x_non_contig = !load_x && !ggml_vk_dim01_contiguous(src0); - const bool y_non_contig = !load_y && !ggml_vk_dim01_contiguous(src1); + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; const bool qx_needs_dequant = x_non_contig; const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig; + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + const uint64_t x_ne = ne01 * ne00; const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne11 * ne01; @@ -2644,16 +3066,12 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context uint64_t x_buf_offset = 0; vk_buffer d_Y; uint64_t y_buf_offset = 0; - if (load_x) { - d_Qx = ctx->prealloc_qx; - } else if(!src1_uma) { + if(!src0_uma) { d_Qx = extra_src0->buffer_gpu.lock(); qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); } - if (load_y) { - d_Qy = ctx->prealloc_qy; - } else if(!src1_uma) { + if(!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qy != nullptr); @@ -2683,7 +3101,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } - vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type); + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); @@ -2700,54 +3118,31 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); - } else if (load_x) { - // copy data to device - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qx, 0, src0, 0, 0, ggml_nrows(src0)); } if (y_non_contig) { GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); - } else if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, 0, src1, 0, 0, ggml_nrows(src1)); } - for (uint64_t i13 = 0; i13 < ne13; i13++) { - const uint64_t i03 = i13 / r3; - for (uint64_t i12 = 0; i12 < ne12; i12++) { - const uint64_t i02 = i12 / r2; + uint32_t stride_batch_x = ne00*ne01; + uint32_t stride_batch_y = ne10*ne11; - const uint64_t it_idx0 = (i03 * ne02 + i02); - const uint64_t it_idx1 = (i13 * ne12 + i12); - const uint64_t x_offset = x_buf_offset + x_sz * it_idx0; - const uint64_t qy_offset = qy_buf_offset + qy_sz * it_idx1; - const uint64_t y_offset = y_buf_offset + y_sz * it_idx1; - const uint64_t d_offset = d_buf_offset + d_sz * it_idx1; - - const uint64_t y_buffer_offset = (y_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; - const uint64_t y_shader_offset = y_offset - y_buffer_offset; - - const uint64_t d_buffer_offset = (d_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; - const uint64_t d_shader_offset = d_offset - d_buffer_offset; - - if (!y_non_contig && qy_needs_dequant) { - const std::vector pc = { (uint32_t)ne11, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(y_ne / 32) }; - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1}); - } - - // compute - const std::array pc = { (uint32_t)ne00, (uint32_t)(y_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type))}; - ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - ggml_vk_sync_buffers(subctx); - ggml_vk_buffer_read_async(ctx, subctx, d_D, d_offset, d, sizeof(float) * d_ne); - } - } + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + // compute + const vk_mat_vec_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, + (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, + stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21), + }; + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1}); } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2789,8 +3184,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; - const uint64_t x_ne = ne00 * ne01 * ne02; const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12; @@ -2805,9 +3198,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); - if (load_y) { - d_Qy = ctx->prealloc_qy; - } else if (!src1_uma) { + if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); @@ -2822,10 +3213,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; - if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); - } - // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); @@ -2881,8 +3268,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con src1_uma = d_Qy != nullptr; } - const bool load_y = src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; - const uint64_t d_ne = ne01 * ne11 * ne12; const uint32_t row_stride_x = nb01 / sizeof(ggml_fp16_t); @@ -2898,9 +3283,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con vk_buffer d_Qx = extra_src0->buffer_gpu.lock(); const uint64_t qx_buf_offset = extra_src0->offset; GGML_ASSERT(d_Qx != nullptr); - if (load_y) { - d_Qy = ctx->prealloc_qy; - } else { + if (!src1_uma) { d_Qy = extra_src1->buffer_gpu.lock(); qy_buf_offset = extra_src1->offset; GGML_ASSERT(d_Qx != nullptr); @@ -2915,10 +3298,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; - if (load_y) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Qy, qy_buf_offset, src1, 0, 0, ggml_nrows(src1)); - } - // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); @@ -2945,7 +3324,7 @@ static bool ggml_vk_can_mul_mat(const ggml_tensor * src0, const ggml_tensor * sr ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_TYPE_GPU); } -static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_mul_mat(" << src0 << ", " << src1 << ", " << dst << ")" << std::endl; #endif @@ -2960,9 +3339,347 @@ static void ggml_vk_mul_mat(ggml_backend_vk_context * ctx, vk_context * subctx, } } -// static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context * subctx, const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { -// -// } +/*static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_mul_mat_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << ids << ", name=" << ids->name << ", type=" << ids->type << ", backend=" << ids->backend << ", ne0=" << ids->ne[0] << ", ne1=" << ids->ne[1] << ", ne2=" << ids->ne[2] << ", ne3=" << ids->ne[3] << ", nb0=" << ids->nb[0] << ", nb1=" << ids->nb[1] << ", nb2=" << ids->nb[2] << ", nb3=" << ids->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; +#endif + GGML_ASSERT(src0->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + const uint64_t ne13 = src1->ne[3]; + + const uint32_t nb11 = src1->nb[1]; + + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + + const uint64_t r2 = ne12 / ne02; + const uint64_t r3 = ne13 / ne03; + + const uint32_t nbi1 = src0->nb[1]; + const uint32_t idx = ((uint32_t *) dst->op_params)[0]; + const uint64_t n_as = ne02; + + GGML_ASSERT(n_as <= 8); + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + ggml_tensor_extra_gpu * extra_ids = (ggml_tensor_extra_gpu *) ids->extra; + + vk_buffer d_Qx; + size_t qx_buf_offset = 0; + vk_buffer d_Qy; + size_t qy_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + } + + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + + const bool y_f32_kernel = src1->type == GGML_TYPE_F32 && !y_non_contig; + + vk_matmul_pipeline mmp = ggml_vk_get_mul_mat_mat_id_pipeline(ctx, src0->type, y_non_contig ? GGML_TYPE_F16 : src1->type); + + const bool qx_needs_dequant = mmp == nullptr || x_non_contig; + const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig; + + if (mmp == nullptr) { + GGML_ASSERT(false); + } + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + + const int x_ne = ne01 * ne00; + const int y_ne = ne11 * ne10; + const int d_ne = ne11 * ne01; + + const uint32_t kpad = ggml_vk_align_size(ne10, ggml_vk_guess_matmul_pipeline_align(ctx, mmp, ne01, ne11)); + const bool aligned = ne10 == kpad && ne01 > 8 && ne11 > 8; + + const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); + + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, mmp, ne01, ne11, aligned); + + const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); + const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); + const uint64_t x_sz = !qx_needs_dequant ? qx_sz : sizeof(ggml_fp16_t) * x_ne; + const uint64_t y_sz = y_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; + const uint64_t d_sz = sizeof(float) * d_ne; + + vk_buffer d_D = extra->buffer_gpu.lock(); + const uint64_t d_buf_offset = extra->offset; + GGML_ASSERT(d_D != nullptr); + GGML_ASSERT(d_D->size >= d_buf_offset + d_sz * ne02 * ne03); + vk_buffer d_X; + uint64_t x_buf_offset = 0; + vk_buffer d_Y; + uint64_t y_buf_offset = 0; + if (!src0_uma) { + d_Qx = extra_src0->buffer_gpu.lock(); + qx_buf_offset = extra_src0->offset; + GGML_ASSERT(d_Qx != nullptr); + } + if (!src1_uma) { + d_Qy = extra_src1->buffer_gpu.lock(); + qy_buf_offset = extra_src1->offset; + GGML_ASSERT(d_Qy != nullptr); + } + if (qx_needs_dequant) { + d_X = ctx->prealloc_x; + GGML_ASSERT(d_X->size >= x_sz * ne02 * ne03); + } else { + d_X = d_Qx; + x_buf_offset = qx_buf_offset; + GGML_ASSERT(qx_sz == x_sz); + } + if (qy_needs_dequant) { + d_Y = ctx->prealloc_y; + GGML_ASSERT(d_Y->size >= y_sz * ne02 * ne03); + } else { + d_Y = d_Qy; + y_buf_offset = qy_buf_offset; + GGML_ASSERT(qy_sz == y_sz); + } + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + } else { + to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + + // Allocate descriptor sets + ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1); + if (qx_needs_dequant) { + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1); + } + if (split_k > 1) { + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); + } + + if (x_non_contig) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); + } else if (qx_needs_dequant) { + const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + } + if (y_non_contig) { + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + } + + uint32_t stride_batch_x = ne00*ne01; + uint32_t stride_batch_y = ne10*ne11; + + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); + } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + // compute + ggml_vk_matmul( + ctx, subctx, pipeline, + { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, + { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, + ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21, + nb11 / ggml_type_size(src1->type), ne20, idx, nbi1, n_as + ); // NOLINT +} + +static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_mul_mat_vec_id_q_f16((" << src0 << ", name=" << src0->name << ", type=" << src0->type << ", backend=" << src0->backend << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3] << ", nb0=" << src0->nb[0] << ", nb1=" << src0->nb[1] << ", nb2=" << src0->nb[2] << ", nb3=" << src0->nb[3]; + std::cerr << "), (" << src1 << ", name=" << src1->name << ", type=" << src1->type << ", backend=" << src1->backend << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3] << ", nb0=" << src1->nb[0] << ", nb1=" << src1->nb[1] << ", nb2=" << src1->nb[2] << ", nb3=" << src1->nb[3]; + std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "),)" << std::endl; +#endif + GGML_ASSERT(ggml_vk_dim01_contiguous(src0) || src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); // NOLINT + GGML_ASSERT(ggml_vk_dim01_contiguous(src1) || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); // NOLINT + + const uint64_t ne00 = src0->ne[0]; + const uint64_t ne01 = src0->ne[1]; + const uint64_t ne02 = src0->ne[2]; + const uint64_t ne03 = src0->ne[3]; + + const uint64_t ne10 = src1->ne[0]; + const uint64_t ne11 = src1->ne[1]; + const uint64_t ne12 = src1->ne[2]; + const uint64_t ne13 = src1->ne[3]; + + GGML_ASSERT(ne11 == 1); + + const uint64_t ne20 = dst->ne[0]; + const uint64_t ne21 = dst->ne[1]; + const uint64_t ne22 = dst->ne[2]; + const uint64_t ne23 = dst->ne[3]; + + const uint64_t nb22 = dst->nb[2]; + const uint64_t nb23 = dst->nb[3]; + + const uint64_t r2 = ne12 / ne02; + const uint64_t r3 = ne13 / ne03; + + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; + ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; + ggml_tensor_extra_gpu * extra_src1 = (ggml_tensor_extra_gpu *) src1->extra; + + vk_buffer d_Qx; + size_t qx_buf_offset = 0; + vk_buffer d_Qy; + size_t qy_buf_offset = 0; + + bool src0_uma = false; + bool src1_uma = false; + + if (ctx->device->uma) { + ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); + ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); + src0_uma = d_Qx != nullptr; + src1_uma = d_Qy != nullptr; + } + + const bool x_non_contig = !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = !ggml_vk_dim01_contiguous(src1); + + const bool f16_f32_kernel = src1->type == GGML_TYPE_F32; + + const bool qx_needs_dequant = x_non_contig; + const bool qy_needs_dequant = (src1->type != GGML_TYPE_F16 && !f16_f32_kernel) || y_non_contig; + + // Not implemented + GGML_ASSERT(y_non_contig || !qy_needs_dequant); // NOLINT + + const uint64_t x_ne = ne01 * ne00; + const uint64_t y_ne = ne11 * ne10; + const uint64_t d_ne = ne11 * ne01; + + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; + const uint64_t d_sz = sizeof(float) * d_ne; + + vk_buffer d_D = extra->buffer_gpu.lock(); + const uint64_t d_buf_offset = extra->offset; + GGML_ASSERT(d_D != nullptr); + vk_buffer d_X; + uint64_t x_buf_offset = 0; + vk_buffer d_Y; + uint64_t y_buf_offset = 0; + if(!src0_uma) { + d_Qx = extra_src0->buffer_gpu.lock(); + qx_buf_offset = extra_src0->offset; + GGML_ASSERT(d_Qx != nullptr); + } + if(!src1_uma) { + d_Qy = extra_src1->buffer_gpu.lock(); + qy_buf_offset = extra_src1->offset; + GGML_ASSERT(d_Qy != nullptr); + } + if (qx_needs_dequant) { + d_X = ctx->prealloc_x; + } else { + d_X = d_Qx; + x_buf_offset = qx_buf_offset; + GGML_ASSERT(qx_sz == x_sz); + } + if (qy_needs_dequant) { + d_Y = ctx->prealloc_y; + } else { + d_Y = d_Qy; + y_buf_offset = qy_buf_offset; + GGML_ASSERT(qy_sz == y_sz); + } + + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; + if (x_non_contig) { + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + } + if (y_non_contig) { + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + } else { + to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); + } + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type, src1->type); + GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT + GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT + GGML_ASSERT(dmmv != nullptr); + + // Allocate descriptor sets + if (qx_needs_dequant) { + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1); + } + if (qy_needs_dequant) { + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + } + ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13); + + if (x_non_contig) { + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); + } + if (y_non_contig) { + GGML_ASSERT(y_sz == ggml_type_size(src1->type) * y_ne); + ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_1, src1, { d_Qy, qy_buf_offset, VK_WHOLE_SIZE }, { d_Y, 0, VK_WHOLE_SIZE }); + } + + uint32_t stride_batch_x = ne00*ne01; + uint32_t stride_batch_y = ne10*ne11; + + if (!ggml_vk_dim01_contiguous(src0) && !qx_needs_dequant) { + stride_batch_x = src0->nb[0] / ggml_type_size(src0->type); + } + + if (!ggml_vk_dim01_contiguous(src1) && !qy_needs_dequant) { + stride_batch_y = src1->nb[0] / ggml_type_size(src1->type); + } + + // compute + const vk_mat_vec_push_constants pc = { + (uint32_t)ne00, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne01, + (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, + stride_batch_x, stride_batch_y, (uint32_t)(ne20*ne21), + // 0, 0, 0, 0, 1 + }; + ggml_vk_sync_buffers(subctx); + ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne22 * ne23} }, sizeof(vk_mat_vec_push_constants), &pc, { (uint32_t)ne01, (uint32_t)(ne12 * ne13), 1}); +}*/ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // guaranteed to be an integer due to the check in ggml_can_repeat @@ -3112,9 +3829,15 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const } return nullptr; case GGML_OP_SOFT_MAX: + GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); + GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32 || src2->type == GGML_TYPE_F16); + if (src0->type == GGML_TYPE_F32 && (src1 == nullptr || src1->type == GGML_TYPE_F32) && (src2 == nullptr || src2->type == GGML_TYPE_F32) && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_soft_max_f32; } + if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16 && src2->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) { + return ctx->device->pipeline_soft_max_f32_f16; + } return nullptr; case GGML_OP_ROPE: { @@ -3162,6 +3885,21 @@ static ggml_vk_func_t ggml_vk_op_get_func(ggml_op op) { } } +static bool ggml_vk_op_supports_incontiguous(ggml_op op) { + switch (op) { + case GGML_OP_CPY: + case GGML_OP_GET_ROWS: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + return true; + default: + return false; + } +} + template static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op, const PC&& pc) { #ifdef GGML_VULKAN_DEBUG @@ -3174,7 +3912,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } std::cerr << "), (" << dst << ", name=" << dst->name << ", type=" << dst->type << ", backend=" << dst->backend << ", ne0=" << dst->ne[0] << ", ne1=" << dst->ne[1] << ", ne2=" << dst->ne[2] << ", ne3=" << dst->ne[3] << ", nb0=" << dst->nb[0] << ", nb1=" << dst->nb[1] << ", nb2=" << dst->nb[2] << ", nb3=" << dst->nb[3] << "), " << ggml_op_name(op) << ")" << std::endl; #endif - GGML_ASSERT(!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type))); // NOLINT + GGML_ASSERT(op == GGML_OP_GET_ROWS || (!ggml_is_quantized(src0->type) && (src1 == nullptr || !ggml_is_quantized(src1->type)))); // NOLINT GGML_ASSERT(op == GGML_OP_CPY || ggml_vk_dim01_contiguous(src0)); // NOLINT GGML_ASSERT(dst->extra != nullptr); const uint64_t ne00 = src0->ne[0]; @@ -3213,6 +3951,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c return; } + const bool op_supports_incontiguous = ggml_vk_op_supports_incontiguous(op); + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; ggml_tensor_extra_gpu * extra_src1 = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; @@ -3242,11 +3982,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } } - const bool transfer_src0 = src0->backend != GGML_BACKEND_TYPE_GPU && !src0_uma; - const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_TYPE_GPU && !src1_uma; - const bool transfer_src2 = use_src2 && src2->backend != GGML_BACKEND_TYPE_GPU && !src2_uma; - - uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); + uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type)/ggml_blck_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t z_sz = use_src2 ? ggml_vk_align_size(ggml_type_size(src2->type) * ne2, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; @@ -3261,55 +3997,43 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c GGML_ASSERT(d_D != nullptr); uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT - if (transfer_src0) { - d_X = ctx->prealloc_qx; - } else if(!src0_uma) { + if(!src0_uma) { d_X = extra_src0->buffer_gpu.lock(); x_buf_offset = extra_src0->offset; GGML_ASSERT(d_X != nullptr); } - if (transfer_src1) { - d_Y = ctx->prealloc_qy; - } else if (use_src1 && !src1_uma) { + if (use_src1 && !src1_uma) { d_Y = extra_src1->buffer_gpu.lock(); y_buf_offset = extra_src1->offset; GGML_ASSERT(d_Y != nullptr); } - GGML_ASSERT(!transfer_src2); if (use_src2 && !src2_uma) { d_Z = extra_src2->buffer_gpu.lock(); z_buf_offset = extra_src2->offset; GGML_ASSERT(d_Z != nullptr); } - if (op == GGML_OP_CPY) { - GGML_ASSERT(!transfer_src0); - GGML_ASSERT(!transfer_src1); + if (op_supports_incontiguous) { x_sz = ggml_nbytes(src0); + y_sz = use_src1 ? ggml_nbytes(src1) : 0; d_sz = ggml_nbytes(dst); - if (extra_src0->offset + x_sz >= d_X->size) { + if (x_buf_offset + x_sz >= d_X->size) { x_sz = VK_WHOLE_SIZE; } - if (extra->offset + d_sz >= d_D->size) { + if (use_src1 && y_buf_offset + y_sz >= d_Y->size) { + y_sz = VK_WHOLE_SIZE; + } + if (d_buf_offset + d_sz >= d_D->size) { d_sz = VK_WHOLE_SIZE; } } std::array elements; - // copy src0 to device - if (transfer_src0) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_X, 0, src0, 0, 0, ggml_nrows(src0)); - ctx->staging_offset = x_sz * ne02 * ne03; - } - if (transfer_src1) { - ggml_vk_h2d_tensor_2d(ctx, subctx, d_Y, 0, src1, 0, 0, ggml_nrows(src1)); - } - // Single call if dimension 2 is contiguous - if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { + if (op_supports_incontiguous || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1); switch (dst->op) { @@ -3322,16 +4046,19 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c case GGML_OP_ROPE: elements = { (uint32_t)ggml_nrows(src0), (uint32_t)ne00, 1 }; break; + case GGML_OP_GET_ROWS: + elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; + break; default: elements = { (uint32_t)ggml_nelements(src0), 1, 1 }; break; } - if (op != GGML_OP_CPY) { + if (!op_supports_incontiguous) { if (x_sz != VK_WHOLE_SIZE) { x_sz *= ne02 * ne03; } - if (y_sz != VK_WHOLE_SIZE) { + if (use_src1 && y_sz != VK_WHOLE_SIZE) { y_sz *= ne12 * ne13; } if (d_sz != VK_WHOLE_SIZE) { @@ -3345,14 +4072,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (use_src1) { subbuf_y = { d_Y, y_buf_offset, y_sz }; } else { - subbuf_y = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; + subbuf_y = { d_X, 0, d_X->size }; } vk_subbuffer subbuf_z; if (use_src2) { subbuf_z = { d_Z, z_buf_offset, z_sz }; } else { - subbuf_z = { ctx->prealloc_y, 0, ctx->prealloc_y->size }; + subbuf_z = { d_X, 0, d_X->size }; } ggml_vk_sync_buffers(subctx); @@ -3386,6 +4113,9 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c case GGML_OP_ROPE: elements = { (uint32_t)ne01, (uint32_t)ne00, 1 }; break; + case GGML_OP_GET_ROWS: + elements = { (uint32_t)ne00, (uint32_t)ne10, (uint32_t)(ne11 * ne12) }; + break; default: elements = { (uint32_t)ne0, 1, 1 }; break; @@ -3420,7 +4150,18 @@ static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context * subctx, c } static void ggml_vk_get_rows(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { (uint32_t)ggml_nelements(src0), (uint32_t)ggml_nelements(src1), 0.0f, 0.0f }); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t src1_type_size = ggml_type_size(src1->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GET_ROWS, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + 0.0f, 0.0f, + }); } static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -3510,7 +4251,9 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons } static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], 0.0f, 0.0f }); + float * op_params = (float *)dst->op_params; + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }); } static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { @@ -3545,7 +4288,7 @@ static void ggml_vk_soft_max(ggml_backend_vk_context * ctx, vk_context * subctx, ggml_vk_op_f32(ctx, subctx, src0, src1, src2, dst, GGML_OP_SOFT_MAX, { ncols, - nrows_y, + src1 != nullptr ? nrows_y : (uint32_t)0, src2 != nullptr ? (uint32_t)1 : (uint32_t)0, scale, max_bias, m0, m1, @@ -3576,9 +4319,9 @@ static void ggml_vk_rope(ggml_backend_vk_context * ctx, vk_context * subctx, con if (is_neox) { const float theta_scale = powf(freq_base, -2.0f/n_dims); const float inv_ndims = -1.0f / n_dims; - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f, theta_scale, inv_ndims }); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], (uint32_t)n_dims, freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f}, theta_scale, inv_ndims }); } else { - ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, corr_dims[0], corr_dims[1], 0.0f, 0.0f }); + ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_ROPE, { (uint32_t)src0->ne[0], freq_scale, (uint32_t)src0->ne[1], freq_base, ext_factor, attn_factor, {corr_dims[0], corr_dims[1], 0.0f, 0.0f} }); } } @@ -3587,16 +4330,6 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context * subctx, ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ARGSORT, { (uint32_t)src0->ne[0], ((ggml_sort_order) op_params[0]) == GGML_SORT_ORDER_ASC }); } -static void ggml_vk_nop(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, ggml_tensor * dst) { - // If backend is CPU, data from src0 has to be copied off the device - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - ggml_tensor_extra_gpu * extra_src0 = (ggml_tensor_extra_gpu *) src0->extra; - vk_buffer d_D = extra_src0->buffer_gpu.lock(); - ggml_vk_sync_buffers(subctx); - ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, dst->data, d_D->size); - } -} - #ifdef GGML_VULKAN_RUN_TESTS static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0, int ne1, int i0, int i1, int i2) { if (type != GGML_TYPE_F32 && type != GGML_TYPE_F16) { @@ -3619,6 +4352,8 @@ static void ggml_vk_print_matrix_area(const void * data, ggml_type type, int ne0 val = *((const float *) data + i2*ne1*ne0 + idx1*ne0 + idx0); } else if (type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*((const ggml_fp16_t *) data + i2*ne1*ne0 + idx1*ne0 + idx0)); + } else { + GGML_ASSERT(false); } fprintf(stderr, "% 7.2f ", val); } else { @@ -3770,7 +4505,10 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx, subctx); - ggml_vk_matmul(ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); + ggml_vk_matmul( + ctx, subctx, p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), + m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n, 0, 0, 0, 0, 1 + ); ggml_vk_ctx_end(subctx); } @@ -3920,6 +4658,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, int i0, int i1 val = *(float *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(ggml_fp16_t *) ((char *) tensor->data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); + } else { + GGML_ASSERT(false); } fprintf(stderr, "% 7.2f ", val); } else { @@ -4273,7 +5013,10 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx, subctx); - ggml_vk_matmul(ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); + ggml_vk_matmul( + ctx, subctx, p, ggml_vk_subbuffer(qx_buf), ggml_vk_subbuffer(y_buf), ggml_vk_subbuffer(d_buf), ggml_vk_subbuffer(ctx->prealloc_split_k), + m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n, 0, 0, 0, 0, 1 + ); ggml_vk_ctx_end(subctx); } @@ -4335,7 +5078,7 @@ static void ggml_vk_test_dequant_matmul(ggml_backend_vk_context * ctx, size_t m, std::cerr << "TEST MMQ " << shname << " m=" << m << " n=" << n << " k=" << k << " batch=" << batch << " split_k=" << split_k << " matmul " << time_ms / num_it << "ms avg_err=" << avg_err << std::endl; - if (avg_err > 0.1 || std::isnan(avg_err)) { + if (avg_err > 0.01 || std::isnan(avg_err)) { std::cerr << "m = " << first_err_m << " n = " << first_err_n << " b = " << first_err_b << std::endl; std::cerr << "Actual result: " << std::endl << std::endl; ggml_vk_print_matrix_area(d, GGML_TYPE_F32, m, n, first_err_m, first_err_n, first_err_b); @@ -4385,27 +5128,15 @@ static ggml_tensor_extra_gpu * ggml_vk_tensor_create_extra(ggml_tensor * tensor) return extra; } -static bool ggml_vk_cpu_assist_op(const ggml_tensor * node) { - return node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID; -} - static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggml_tensor * node){ #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_preallocate_buffers_graph(" << node << ")" << std::endl; #endif - const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (node->src[1] != nullptr && (node->src[1]->backend == GGML_BACKEND_TYPE_GPU)); - - if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node))) { + if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU) { return; } ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) node->extra; - if (extra == nullptr) { - // Workaround for CPU backend BLAS matmul calls - extra = ggml_vk_tensor_create_extra(node); - } ggml_tensor * src0 = node->src[0]; ggml_tensor * src1 = node->src[1]; @@ -4425,7 +5156,18 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm const int64_t ne22 = node->ne[2]; const int64_t ne23 = node->ne[3]; - const bool f16_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32; + const ggml_type src0_type = (use_src0 && src0->type == GGML_TYPE_F32) ? src0->type : GGML_TYPE_F16; + const ggml_type src1_type = (use_src1 && src1->type == GGML_TYPE_F32) ? src1->type : GGML_TYPE_F16; + + const bool x_non_contig = use_src0 && !ggml_vk_dim01_contiguous(src0); + const bool y_non_contig = use_src1 && !ggml_vk_dim01_contiguous(src1); + + const bool y_f32_kernel = use_src1 && src1->type == GGML_TYPE_F32 && !y_non_contig; + + bool mmp = (use_src0 && use_src1 && src1_type == GGML_TYPE_F32) ? ggml_vk_get_mul_mat_mat_pipeline(ctx, src0_type, y_non_contig ? GGML_TYPE_F16 : src1->type) != nullptr : false; + + const bool qx_needs_dequant = use_src0 && (mmp || x_non_contig); + const bool qy_needs_dequant = use_src1 && ((src1->type != GGML_TYPE_F16 && !y_f32_kernel) || y_non_contig); int split_k; if (node->op == GGML_OP_MUL_MAT || node->op == GGML_OP_MUL_MAT_ID) { @@ -4437,10 +5179,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm const uint32_t y_ne = ne10 * ne11; const uint32_t d_ne = ne20 * ne21; - const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + const uint64_t x_sz = (use_src0 && qx_needs_dequant) ? ggml_vk_align_size(sizeof(src0_type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t y_sz = (use_src1 && qy_needs_dequant) ? ggml_vk_align_size(sizeof(src1_type) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0; @@ -4483,12 +5223,6 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - if (ctx->prealloc_size_qx < qx_sz) { - ctx->prealloc_size_qx = qx_sz; - } - if (ctx->prealloc_size_qy < qy_sz) { - ctx->prealloc_size_qy = qy_sz; - } if (ctx->prealloc_size_x < x_sz) { ctx->prealloc_size_x = x_sz; } @@ -4512,7 +5246,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { return; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_preallocate_buffers(qx_size: " << ctx->prealloc_size_qx << " qy_size: " << ctx->prealloc_size_qy << " x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; + std::cerr << "ggml_vk_preallocate_buffers(x_size: " << ctx->prealloc_size_x << " y_size: " << ctx->prealloc_size_y << " split_k_size: " << ctx->prealloc_size_split_k << ")" << std::endl; #endif #if defined(GGML_VULKAN_RUN_TESTS) ctx->staging = ggml_vk_create_buffer_check(ctx, 100ul * 1024ul * 1024ul, @@ -4533,6 +5267,8 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q5_K); ggml_vk_test_dequant(ctx, 7680, GGML_TYPE_Q6_K); + ggml_vk_test_matmul(ctx, 512, 512, 100, 32, 100, 1, 2); + ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 0); ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 1); ggml_vk_test_matmul(ctx, 128, 512, 512, 2, 100, 1, 2); @@ -4575,6 +5311,41 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q8_0); ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q8_0); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q2_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q2_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q2_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q2_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q2_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q2_K); + + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q3_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q3_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q3_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q3_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q3_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q3_K); + + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q4_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q4_K); + + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q5_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q5_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q5_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q5_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q5_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q5_K); + + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 0, GGML_TYPE_Q6_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 1, GGML_TYPE_Q6_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 1, 2, GGML_TYPE_Q6_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 0, GGML_TYPE_Q6_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 1, GGML_TYPE_Q6_K); + ggml_vk_test_dequant_matmul(ctx, 128, 512, 512, 2, 100, 4, 2, GGML_TYPE_Q6_K); + std::cerr << std::endl; const std::vector vals { @@ -4614,20 +5385,6 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { GGML_ASSERT(false); #endif - if (ctx->prealloc_qx == nullptr || (ctx->prealloc_size_qx > 0 && ctx->prealloc_qx->size < ctx->prealloc_size_qx)) { - // Resize buffer - if (ctx->prealloc_qx != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_qx); - } - ctx->prealloc_qx = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qx); - } - if (ctx->prealloc_qy == nullptr || (ctx->prealloc_size_qy > 0 && ctx->prealloc_qy->size < ctx->prealloc_size_qy)) { - // Resize buffer - if (ctx->prealloc_qy != nullptr) { - ggml_vk_destroy_buffer(ctx->prealloc_qy); - } - ctx->prealloc_qy = ggml_vk_create_buffer_device(ctx, ctx->prealloc_size_qy); - } if (ctx->prealloc_x == nullptr || (ctx->prealloc_size_x > 0 && ctx->prealloc_x->size < ctx->prealloc_size_x)) { // Resize buffer if (ctx->prealloc_x != nullptr) { @@ -4661,11 +5418,7 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) { } static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * node, bool last_node){ - const bool any_on_device = node->backend == GGML_BACKEND_TYPE_GPU - || (node->src[0] != nullptr && (node->src[0]->backend == GGML_BACKEND_TYPE_GPU || node->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (node->src[1] != nullptr && node->src[1]->backend == GGML_BACKEND_TYPE_GPU); - - if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(node)) || (ggml_vk_cpu_assist_op(node) && !any_on_device && !ggml_vk_can_mul_mat(node->src[0], node->src[1], node))) { + if (ctx->disable || node->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(node)) { return; } @@ -4693,7 +5446,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } break; case GGML_OP_REPEAT: - // case GGML_OP_GET_ROWS: + case GGML_OP_GET_ROWS: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_SCALE: @@ -4717,10 +5470,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_ARGSORT: break; default: - if (any_on_device) { - std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; - GGML_ASSERT(false); - } + std::cerr << "ggml_vulkan: Error: Missing op: " << ggml_op_name(node->op) << std::endl; + GGML_ASSERT(false); return; } @@ -4769,8 +5520,6 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NONE: - ggml_vk_nop(ctx, ctx->compute_ctx, src0, node); - break; case GGML_OP_NORM: ggml_vk_norm(ctx, ctx->compute_ctx, src0, node); @@ -4837,11 +5586,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_params * params, ggml_tensor * tensor){ - const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); - - if (ctx->disable || (!any_on_device && !ggml_vk_cpu_assist_op(tensor))) { + if (ctx->disable) { return false; } @@ -4884,10 +5629,6 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ break; case GGML_OP_MUL_MAT: case GGML_OP_MUL_MAT_ID: - if (!any_on_device && !ggml_vk_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { - return false; - } - extra = (ggml_tensor_extra_gpu *) tensor->extra; break; @@ -5001,8 +5742,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { #endif ggml_vk_graph_cleanup(ctx); - ggml_vk_destroy_buffer(ctx->prealloc_qx); - ggml_vk_destroy_buffer(ctx->prealloc_qy); ggml_vk_destroy_buffer(ctx->prealloc_x); ggml_vk_destroy_buffer(ctx->prealloc_y); ggml_vk_destroy_buffer(ctx->prealloc_split_k); @@ -5013,8 +5752,6 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ggml_vk_destroy_buffer(buffer); } - ctx->prealloc_size_qx = 0; - ctx->prealloc_size_qy = 0; ctx->prealloc_size_x = 0; ctx->prealloc_size_y = 0; ctx->prealloc_size_split_k = 0; @@ -5045,80 +5782,6 @@ GGML_CALL static void ggml_vk_get_device_description(int device, char * descript snprintf(description, description_size, "%s", props.deviceName.data()); } -// CPU assist interface - -void ggml_vk_init_cpu_assist() { - ggml_vk_instance_init(); - - std::cerr << "ggml_vulkan: Found " << ggml_vk_get_device_count() << " Vulkan devices:" << std::endl; - - for (int i = 0; i < ggml_vk_get_device_count(); i++) { - ggml_vk_print_gpu_info(i); - } - // Initialize the first backend to make sure CPU matrix multiplications can be offloaded. - ggml_backend_vk_init(0); -} - -void ggml_vk_preallocate_buffers_graph_cpu_assist(ggml_tensor * node) { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized) { - return; - } - - ggml_vk_preallocate_buffers_graph(ctx, node); -} - -void ggml_vk_preallocate_buffers_cpu_assist() { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized) { - return; - } - - ggml_vk_preallocate_buffers(ctx); -} - -void ggml_vk_build_graph_cpu_assist(ggml_tensor * node, bool last_node) { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized) { - return; - } - - ggml_vk_build_graph(ctx, node, last_node); -} - -bool ggml_vk_compute_forward_cpu_assist(ggml_compute_params * params, ggml_tensor * tensor){ - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized) { - return false; - } - - return ggml_vk_compute_forward(ctx, params, tensor); -} - -void ggml_vk_graph_cleanup_cpu_assist() { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized) { - return; - } - - ggml_vk_graph_cleanup(ctx); -} - -void ggml_vk_free_cpu_assist() { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - if (!ctx->initialized || vk_instance.backends[0] == nullptr) { - return; - } - - ggml_backend_vk_free(vk_instance.backends[0]); -} - // backend interface #define UNUSED GGML_UNUSED @@ -5330,16 +5993,16 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = { /* .is_host = */ NULL, }; -GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t idx) { +GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_buffer_type(" << idx << ")" << std::endl; + std::cerr << "ggml_backend_vk_buffer_type(" << dev_num << ")" << std::endl; #endif - GGML_ASSERT(idx < vk_instance.device_indices.size()); + GGML_ASSERT(dev_num < vk_instance.device_indices.size()); - ggml_backend_vk_init(idx); + ggml_backend_vk_init(dev_num); - return &vk_instance.buffer_types[idx]; + return &vk_instance.buffer_types[dev_num]; } // host buffer type @@ -5508,7 +6171,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c vk_buffer src_buf = src_extra->buffer_gpu.lock(); vk_buffer dst_buf = dst_extra->buffer_gpu.lock(); - ggml_vk_buffer_copy_async(ctx->transfer_ctx, src_buf, src_extra->offset, dst_buf, dst_extra->offset, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(ctx->transfer_ctx, dst_buf, dst_extra->offset, src_buf, src_extra->offset, ggml_nbytes(src)); return true; } @@ -5542,6 +6205,9 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { } GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_backend_vk_graph_compute(" << cgraph->n_nodes << " nodes)" << std::endl; +#endif ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; for (int i = 0; i < cgraph->n_nodes; i++) { @@ -5552,7 +6218,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen int last_node = cgraph->n_nodes - 1; // If the last op in the cgraph isn't backend GPU, the command buffer doesn't get closed properly - while (last_node > 0 && cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU) { + while (last_node > 0 && (cgraph->nodes[last_node]->backend != GGML_BACKEND_TYPE_GPU || ggml_is_empty(cgraph->nodes[last_node]))) { last_node -= 1; } @@ -5566,7 +6232,7 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } @@ -5590,6 +6256,8 @@ GGML_CALL static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backen } GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + // ggml_backend_vk_context * ctx = (ggml_backend_vk_context *) backend->context; + switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { @@ -5602,8 +6270,25 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } break; case GGML_OP_MUL_MAT: - case GGML_OP_MUL_MAT_ID: + // case GGML_OP_MUL_MAT_ID: { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + break; + default: + return false; + } struct ggml_tensor * a; struct ggml_tensor * b; if (op->op == GGML_OP_MUL_MAT) { @@ -5618,25 +6303,26 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return true; } break; - // case GGML_OP_GET_ROWS: - // { - // switch (op->src[0]->type) { - // case GGML_TYPE_F16: - // case GGML_TYPE_F32: - // case GGML_TYPE_Q4_0: - // case GGML_TYPE_Q4_1: - // case GGML_TYPE_Q5_0: - // case GGML_TYPE_Q5_1: - // case GGML_TYPE_Q8_0: - // return true; - // default: - // return false; - // } - // } break; + case GGML_OP_GET_ROWS: + { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; case GGML_OP_CPY: + case GGML_OP_DUP: { ggml_type src0_type = op->src[0]->type; - ggml_type src1_type = op->src[1]->type; + ggml_type src1_type = op->src[1] != nullptr ? op->src[1]->type : src0_type; if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) { return true; } @@ -5648,7 +6334,6 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const } return false; } break; - case GGML_OP_DUP: // case GGML_OP_REPEAT: // { // ggml_type src0_type = op->src[0]->type; @@ -5685,6 +6370,20 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const UNUSED(backend); } +GGML_CALL static bool ggml_backend_vk_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const ggml_tensor * dst = op; + + const int min_batch_size = 32; + + if (dst->ne[1] > min_batch_size && dst->op != GGML_OP_GET_ROWS) { + return true; + } + + return false; + + UNUSED(backend); +} + // TODO: enable async and synchronize static ggml_backend_i ggml_backend_vk_interface = { /* .get_name = */ ggml_backend_vk_name, @@ -5699,7 +6398,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, - /* .offload_op = */ NULL, + /* .offload_op = */ ggml_backend_vk_offload_op, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -5712,22 +6411,22 @@ static ggml_guid_t ggml_backend_vk_guid() { return &guid; } -GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { - if (vk_instance.initialized[idx]) { - return vk_instance.backends[idx]; +GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num) { + if (vk_instance.initialized[dev_num]) { + return vk_instance.backends[dev_num]; } #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_backend_vk_init(" << idx << ")" << std::endl; + std::cerr << "ggml_backend_vk_init(" << dev_num << ")" << std::endl; #endif - ggml_backend_vk_context * ctx = &vk_instance.contexts[idx]; - ggml_vk_init(ctx, idx); - ctx->name = GGML_VK_NAME + std::to_string(idx); - vk_instance.buffer_types[idx] = { + ggml_backend_vk_context * ctx = &vk_instance.contexts[dev_num]; + ggml_vk_init(ctx, dev_num); + ctx->name = GGML_VK_NAME + std::to_string(dev_num); + vk_instance.buffer_types[dev_num] = { /* .iface = */ ggml_backend_vk_buffer_type_interface, /* .context = */ new ggml_backend_vk_buffer_type_context{ ctx->name, ctx }, }; - vk_instance.initialized[idx] = true; + vk_instance.initialized[dev_num] = true; ggml_backend_t vk_backend = new ggml_backend { /* .guid = */ ggml_backend_vk_guid(), @@ -5735,7 +6434,7 @@ GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t idx) { /* .context = */ &vk_instance.contexts[ctx->idx], }; - vk_instance.backends[idx] = vk_backend; + vk_instance.backends[dev_num] = vk_backend; return vk_backend; } @@ -5779,10 +6478,12 @@ GGML_CALL static ggml_backend_t ggml_backend_reg_vk_init(const char * params, vo extern "C" GGML_CALL int ggml_backend_vk_reg_devices(); GGML_CALL int ggml_backend_vk_reg_devices() { - for (auto idx : vk_instance.device_indices) { + ggml_vk_instance_init(); + + for (size_t i = 0; i < vk_instance.device_indices.size(); i++) { char name[128]; - snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, idx); - ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(idx), (void *) (intptr_t) idx); + snprintf(name, sizeof(name), "%s%ld", GGML_VK_NAME, i); + ggml_backend_register(name, ggml_backend_reg_vk_init, ggml_backend_vk_buffer_type(i), (void *) (intptr_t) i); // NOLINT } return vk_instance.device_indices.size(); } @@ -5866,6 +6567,8 @@ static void ggml_vk_print_tensor_area(const ggml_tensor * tensor, const void * d val = *(const float *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0]); } else if (tensor->type == GGML_TYPE_F16) { val = ggml_fp16_to_fp32(*(const ggml_fp16_t *) ((const char *) data + i3*tensor->nb[3] + i2*tensor->nb[2] + idx1*tensor->nb[1] + idx0*tensor->nb[0])); + } else { + GGML_ASSERT(false); } fprintf(stderr, "% 7.2f ", val); } else { @@ -5960,6 +6663,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ return; } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_check_results_0(" << tensor->name << ")" << std::endl; +#endif + ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; ggml_tensor * src2 = tensor->src[2]; @@ -6173,7 +6880,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ tensor_clone = ggml_soft_max(ggml_ctx, src0_clone); } } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { - tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params); + tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(int *)tensor->op_params); } else if (tensor->op == GGML_OP_ROPE) { const int n_dims = ((int32_t *) tensor->op_params)[1]; const int mode = ((int32_t *) tensor->op_params)[2]; @@ -6219,6 +6926,10 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ tensor_clone = ggml_permute(ggml_ctx, src0_clone, params[0], params[1], params[2], params[3]); } else if (tensor->op == GGML_OP_TRANSPOSE) { tensor_clone = ggml_transpose(ggml_ctx, src0_clone); + } else if (tensor->op == GGML_OP_GET_ROWS) { + tensor_clone = ggml_get_rows(ggml_ctx, src0_clone, src1_clone); + } else if (tensor->op == GGML_OP_ARGSORT) { + tensor_clone = ggml_argsort(ggml_ctx, src0_clone, (ggml_sort_order) *(int *)tensor->op_params); } else { std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl; GGML_ASSERT(false); @@ -6252,7 +6963,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ free(src1_buffer); } if (src2 != nullptr) { - free(src1_buffer); + free(src2_buffer); } ggml_free(ggml_ctx); @@ -6269,6 +6980,10 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ return; } +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_check_results_1(" << tensor->name << ")" << std::endl; +#endif + ggml_tensor * src0 = tensor->src[0]; ggml_tensor * src1 = tensor->src[1]; @@ -6412,10 +7127,4 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ free(tensor_data); } } - -void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor) { - ggml_backend_vk_context * ctx = &vk_instance.contexts[0]; - - ggml_vk_check_results_0(ctx, params, tensor); -} #endif diff --git a/ggml-vulkan.h b/ggml-vulkan.h index e4317c3e0..af661c2d7 100644 --- a/ggml-vulkan.h +++ b/ggml-vulkan.h @@ -11,17 +11,6 @@ extern "C" { #define GGML_VK_MAX_DEVICES 16 GGML_API void ggml_vk_instance_init(void); -GGML_API void ggml_vk_init_cpu_assist(void); - -GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node); -GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void); -GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node); -GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); -#ifdef GGML_VULKAN_CHECK_RESULTS -void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor); -#endif -GGML_API void ggml_vk_graph_cleanup_cpu_assist(void); -GGML_API void ggml_vk_free_cpu_assist(void); // backend API GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num); diff --git a/ggml.c b/ggml.c index 1d5854960..093d38d00 100644 --- a/ggml.c +++ b/ggml.c @@ -3,6 +3,8 @@ #include "ggml-impl.h" #include "ggml-quants.h" +#include "ggml.h" +#include "sgemm.h" #if defined(_MSC_VER) || defined(__MINGW32__) #include // using malloc.h with MSC/MINGW @@ -31,6 +33,10 @@ #include #endif +#ifdef __ARM_FEATURE_MATMUL_INT8 +#undef GGML_USE_LLAMAFILE +#endif + #if defined(_MSC_VER) // disable "possible loss of data" to avoid hundreds of casts // we should just be careful :) @@ -43,6 +49,10 @@ #if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX + #define NOMINMAX +#endif #include typedef volatile LONG atomic_int; @@ -273,8 +283,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #include #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions #include "ggml-opencl.h" -#elif defined(GGML_USE_VULKAN) -#include "ggml-vulkan.h" #endif #elif defined(GGML_USE_OPENBLAS) #if defined(GGML_BLAS_USE_MKL) @@ -284,10 +292,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #endif #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" -#elif defined(GGML_USE_VULKAN) -#include "ggml-vulkan.h" -#elif defined(GGML_USE_SYCL) -#include "ggml-sycl.h" #endif // floating point type used to accumulate sums @@ -318,7 +322,7 @@ static ggml_fp16_t ggml_table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) (ggml-impl.h) float ggml_table_f32_f16[1 << 16]; -const char * ggml_status_to_string(enum ggml_status status) { +GGML_CALL const char * ggml_status_to_string(enum ggml_status status) { switch (status) { case GGML_STATUS_ALLOC_FAILED: return "GGML status: error (failed to allocate memory)"; case GGML_STATUS_FAILED: return "GGML status: error (operation failed)"; @@ -329,24 +333,34 @@ const char * ggml_status_to_string(enum ggml_status status) { return "GGML status: unknown"; } -// note: do not use these inside ggml.c -// these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { +#define ggml_fp16_to_fp32 do_not_use__ggml_fp16_to_fp32__in_ggml return GGML_FP16_TO_FP32(x); } ggml_fp16_t ggml_fp32_to_fp16(float x) { +#define ggml_fp32_to_fp16 do_not_use__ggml_fp32_to_fp16__in_ggml return GGML_FP32_TO_FP16(x); } -void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n) { - for (int i = 0; i < n; i++) { +float ggml_bf16_to_fp32(ggml_bf16_t x) { +#define ggml_bf16_to_fp32 do_not_use__ggml_bf16_to_fp32__in_ggml + return GGML_BF16_TO_FP32(x); // it just left shifts +} + +ggml_bf16_t ggml_fp32_to_bf16(float x) { +#define ggml_fp32_to_bf16 do_not_use__ggml_fp32_to_bf16__in_ggml + return GGML_FP32_TO_BF16(x); +} + +void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { + for (int64_t i = 0; i < n; i++) { y[i] = GGML_FP16_TO_FP32(x[i]); } } -void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { - int i = 0; +void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; #if defined(__F16C__) for (; i + 7 < n; i += 8) { __m256 x_vec = _mm256_loadu_ps(x + i); @@ -364,6 +378,49 @@ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n) { } } +void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX512F__) + for (; i + 16 <= n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#elif defined(__AVX2__) + for (; i + 8 <= n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_BF16_TO_FP32(x[i]); + } +} + +void ggml_fp32_to_bf16_row(const float * x, ggml_bf16_t * y, int64_t n) { + int i = 0; +#if defined(__AVX512BF16__) + for (; i + 32 <= n; i += 32) { + _mm512_storeu_ps( + (__m512 *)(y + i), + (__m512)_mm512_cvtne2ps_pbh(_mm512_loadu_ps(x + i + 16), + _mm512_loadu_ps(x + i))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_FP32_TO_BF16(x[i]); + } +} + bool ggml_guid_matches(ggml_guid_t guid_a, ggml_guid_t guid_b) { return memcmp(guid_a, guid_b, sizeof(ggml_guid)) == 0; } @@ -430,6 +487,57 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif +// +// cross-platform UTF-8 file paths +// + +#ifdef _WIN32 +static wchar_t * ggml_mbstowcs(const char * mbs) { + int wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, NULL, 0); + if (!wlen) { + errno = EINVAL; + return NULL; + } + + wchar_t * wbuf = GGML_MALLOC(wlen * sizeof(wchar_t)); + wlen = MultiByteToWideChar(CP_UTF8, 0, mbs, -1, wbuf, wlen); + if (!wlen) { + GGML_FREE(wbuf); + errno = EINVAL; + return NULL; + } + + return wbuf; +} +#endif + +FILE * ggml_fopen(const char * fname, const char * mode) { +#ifdef _WIN32 + FILE * file = NULL; + + // convert fname (UTF-8) + wchar_t * wfname = ggml_mbstowcs(fname); + if (wfname) { + // convert mode (ANSI) + wchar_t * wmode = GGML_MALLOC((strlen(mode) + 1) * sizeof(wchar_t)); + wchar_t * wmode_p = wmode; + do { + *wmode_p++ = (wchar_t)*mode; + } while (*mode++); + + // open file + file = _wfopen(wfname, wmode); + + GGML_FREE(wfname); + GGML_FREE(wmode); + } + + return file; +#else + return fopen(fname, mode); +#endif +} + // // cache line // @@ -448,6 +556,7 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * restrict x, size_t bx, const float * restrict y, size_t by, int nrc); static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -740,6 +849,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot_type = GGML_TYPE_Q8_K, .nrows = 1, }, + [GGML_TYPE_IQ1_M] = { + .type_name = "iq1_m", + .blck_size = QK_K, + .type_size = sizeof(block_iq1_m), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq1_m, + .from_float = NULL, + .from_float_reference = NULL, + .vec_dot = ggml_vec_dot_iq1_m_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, [GGML_TYPE_IQ4_NL] = { .type_name = "iq4_nl", .blck_size = QK4_NL, @@ -778,6 +899,18 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .type_size = sizeof(block_q8_K), .is_quantized = true, .from_float = quantize_row_q8_K, + }, + [GGML_TYPE_BF16] = { + .type_name = "bf16", + .blck_size = 1, + .type_size = sizeof(ggml_bf16_t), + .is_quantized = false, + .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row, + .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, + .vec_dot_type = GGML_TYPE_BF16, + .nrows = 1, } }; @@ -791,18 +924,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { // simd mappings // -#if defined(__ARM_NEON) -#if !defined(__aarch64__) - -// 64-bit compatibility - -inline static float vaddvq_f32(float32x4_t v) { - return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3); -} - -#endif -#endif - // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros @@ -896,7 +1017,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO #define GGML_F16_VEC_SET1 GGML_F16x8_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i]) #define GGML_F16_VEC_FMA GGML_F16x8_FMA #define GGML_F16_VEC_ADD GGML_F16x8_ADD #define GGML_F16_VEC_MUL GGML_F16x8_MUL @@ -922,7 +1043,7 @@ inline static float vaddvq_f32(float32x4_t v) { #define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO #define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1 #define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p) - #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i]) + #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i]) #define GGML_F16_VEC_FMA GGML_F32Cx4_FMA #define GGML_F16_VEC_ADD GGML_F32Cx4_ADD #define GGML_F16_VEC_MUL GGML_F32Cx4_MUL @@ -991,7 +1112,7 @@ do { \ // unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F // so F16C guard isn't required -#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((__m256i *)(x))) +#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x))) #define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0)) #define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a) @@ -1089,7 +1210,7 @@ do { \ #if defined(__F16C__) // the _mm256_cvt intrinsics require F16C -#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) +#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) #else static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { @@ -1425,6 +1546,8 @@ inline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { inline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; } +inline static void ggml_vec_set_bf16(const int n, ggml_bf16_t * x, const ggml_bf16_t v) { for (int i = 0; i < n; ++i) x[i] = v; } + inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } inline static void ggml_vec_add1_f32(const int n, float * z, const float * x, const float v) { for (int i = 0; i < n; ++i) z[i] = x[i] + v; } inline static void ggml_vec_acc_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] += x[i]; } @@ -1443,7 +1566,7 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * UNUSED(by); UNUSED(bs); -#ifdef GGML_SIMD +#if defined(GGML_SIMD) float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1479,6 +1602,70 @@ static void ggml_vec_dot_f32(int n, float * restrict s, size_t bs, const float * *s = sumf; } +static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t * restrict x, size_t bx, ggml_bf16_t * restrict y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + int i = 0; + ggml_float sumf = 0; + +#if defined(__AVX512BF16__) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 64 <= n; i += 64) { + c1 = _mm512_dpbf16_ps(c1, (__m512bh)_mm512_loadu_ps((const float *)(x + i)), + (__m512bh)_mm512_loadu_ps((const float *)(y + i))); + c2 = _mm512_dpbf16_ps(c2, (__m512bh)_mm512_loadu_ps((const float *)(x + i + 32)), + (__m512bh)_mm512_loadu_ps((const float *)(y + i + 32))); + } + sumf += (ggml_float)_mm512_reduce_add_ps(c1); + sumf += (ggml_float)_mm512_reduce_add_ps(c2); + +#elif defined(__AVX512F__) +#define LOAD(p) _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(_mm256_loadu_si256((const __m256i *)(p))), 16)) + __m512 c1 = _mm512_setzero_ps(); + __m512 c2 = _mm512_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm512_add_ps(_mm512_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c2); + } + sumf += (ggml_float)_mm512_reduce_add_ps(c1); + sumf += (ggml_float)_mm512_reduce_add_ps(c2); + +#undef LOAD +#elif defined(__AVX2__) +#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)) + __m256 c1 = _mm256_setzero_ps(); + __m256 c2 = _mm256_setzero_ps(); + __m256 c3 = _mm256_setzero_ps(); + __m256 c4 = _mm256_setzero_ps(); + for (; i + 32 <= n; i += 32) { + c1 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i), LOAD(y + i)), c1); + c2 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 8), LOAD(y + i + 8)), c2); + c3 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 16), LOAD(y + i + 16)), c3); + c4 = _mm256_add_ps(_mm256_mul_ps(LOAD(x + i + 24), LOAD(y + i + 24)), c4); + } + __m128 g; + c1 = _mm256_add_ps(_mm256_add_ps(c1, c3), + _mm256_add_ps(c2, c4)); + g = _mm_add_ps(_mm256_extractf128_ps(c1, 1), + _mm256_castps256_ps128(c1)); + g = _mm_add_ps(g, _mm_movehl_ps(g, g)); + g = _mm_add_ss(g, _mm_movehdup_ps(g)); + sumf += (ggml_float)_mm_cvtss_f32(g); + +#undef LOAD +#endif + + for (; i < n; ++i) { + sumf += (ggml_float)(GGML_BF16_TO_FP32(x[i]) * + GGML_BF16_TO_FP32(y[i])); + } + *s = sumf; +} + static void ggml_vec_dot_f16(int n, float * restrict s, size_t bs, ggml_fp16_t * restrict x, size_t bx, ggml_fp16_t * restrict y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -1607,6 +1794,37 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float #endif } +inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, const ggml_fp16_t * restrict x, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ax[GGML_F16_ARR]; + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v); + } +#endif +} + // xs and vs are byte strides of x and v inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { @@ -1691,6 +1909,35 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #endif } +inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) { +#if defined(GGML_SIMD) + const int np = (n & ~(GGML_F16_STEP - 1)); + + GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); + + GGML_F16_VEC ay[GGML_F16_ARR]; + + for (int i = 0; i < np; i += GGML_F16_STEP) { + for (int j = 0; j < GGML_F16_ARR; j++) { + ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j); + ay[j] = GGML_F16_VEC_MUL(ay[j], vx); + + GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j); + } + } + + // leftovers + for (int i = np; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + } +#else + // scalar + for (int i = 0; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i])*v); + } +#endif +} + inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } @@ -1852,6 +2099,14 @@ inline static void ggml_vec_sum_f16_ggf(const int n, float * s, const ggml_fp16_ *s = sum; } +inline static void ggml_vec_sum_bf16_ggf(const int n, float * s, const ggml_bf16_t * x) { + float sum = 0.0f; + for (int i = 0; i < n; ++i) { + sum += GGML_BF16_TO_FP32(x[i]); + } + *s = sum; +} + inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE float max = -INFINITY; @@ -1945,6 +2200,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "LEAKY_RELU", "FLASH_ATTN", + "FLASH_ATTN_EXT", "FLASH_FF", "FLASH_ATTN_BACK", "SSM_CONV", @@ -1971,7 +2227,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); +static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -2035,6 +2291,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "leaky_relu(x)", "flash_attn(x)", + "flash_attn_ext(x)", "flash_ff(x)", "flash_attn_back(x)", "ssm_conv(x)", @@ -2061,7 +2318,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76"); +static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -2260,7 +2517,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) { // figure out which node we're on uint current_cpu; int getcpu_ret = 0; -#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) +#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__) getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node); #else // old glibc doesn't have a wrapper for this call. Fall back on direct syscall @@ -2471,6 +2728,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { switch (ftype) { case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_BF16: wtype = GGML_TYPE_BF16; break; case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; @@ -2485,6 +2743,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_MOSTLY_IQ3_XXS: wtype = GGML_TYPE_IQ3_XXS; break; case GGML_FTYPE_MOSTLY_IQ1_S: wtype = GGML_TYPE_IQ1_S; break; + case GGML_FTYPE_MOSTLY_IQ1_M: wtype = GGML_TYPE_IQ1_M; break; case GGML_FTYPE_MOSTLY_IQ4_NL: wtype = GGML_TYPE_IQ4_NL; break; case GGML_FTYPE_MOSTLY_IQ4_XS: wtype = GGML_TYPE_IQ4_XS; break; case GGML_FTYPE_MOSTLY_IQ3_S: wtype = GGML_TYPE_IQ3_S; break; @@ -2540,6 +2799,16 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } +GGML_CALL bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + // empty if any dimension has no elements + return true; + } + } + return false; +} + bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -2554,7 +2823,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return + return ggml_is_empty(t0) ? ggml_is_empty(t1) : (t1->ne[0]%t0->ne[0] == 0) && (t1->ne[1]%t0->ne[1] == 0) && (t1->ne[2]%t0->ne[2] == 0) && @@ -2601,15 +2870,16 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { { const uint64_t t_start = ggml_time_us(); UNUSED(t_start); - ggml_fp16_t ii; for (int i = 0; i < (1 << 16); ++i) { - uint16_t ui = i; - memcpy(&ii, &ui, sizeof(ii)); - const float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); + union { + uint16_t u16; + ggml_fp16_t fp16; + } u = {i}; + float f = ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); ggml_table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); ggml_table_gelu_quick_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_quick_f32(f)); ggml_table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); + ggml_table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); @@ -2640,10 +2910,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { #if defined(GGML_USE_CLBLAST) ggml_cl_init(); -#elif defined(GGML_USE_VULKAN) - ggml_vk_init_cpu_assist(); -#elif defined(GGML_USE_SYCL) - ggml_init_sycl(); #endif ggml_setup_op_has_task_pass(); @@ -2863,7 +3129,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( data_size *= ne[i]; } - GGML_ASSERT(view_src == NULL || data_size + view_offs <= ggml_nbytes(view_src)); + GGML_ASSERT(view_src == NULL || data_size == 0 || data_size + view_offs <= ggml_nbytes(view_src)); void * data = view_src != NULL ? view_src->data : NULL; if (data != NULL) { @@ -3077,6 +3343,13 @@ struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) { ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; + case GGML_TYPE_BF16: + { + assert(tensor->nb[0] == sizeof(ggml_fp16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); + } + } break; case GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); @@ -3129,6 +3402,13 @@ struct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) { ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), GGML_FP32_TO_FP16(value)); } } break; + case GGML_TYPE_BF16: + { + assert(tensor->nb[0] == sizeof(ggml_bf16_t)); + for (int i = 0; i < n; i++) { + ggml_vec_set_bf16(nc, (ggml_bf16_t *)(data + i*n1), GGML_FP32_TO_BF16(value)); + } + } break; case GGML_TYPE_F32: { assert(tensor->nb[0] == sizeof(float)); @@ -3196,6 +3476,11 @@ int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3238,6 +3523,11 @@ void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); } break; + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + } break; case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3261,6 +3551,8 @@ int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i return ((int32_t *) data)[0]; case GGML_TYPE_F16: return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_BF16: + return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: return ((float *) data)[0]; default: @@ -3289,6 +3581,10 @@ void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, { ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); } break; + case GGML_TYPE_BF16: + { + ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); + } break; case GGML_TYPE_F32: { ((float *)(data))[0] = value; @@ -3327,6 +3623,11 @@ float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]); } + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + return GGML_BF16_TO_FP32(((ggml_bf16_t *)(tensor->data))[i]); + } case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3369,6 +3670,11 @@ void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) { GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t)); ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value); } break; + case GGML_TYPE_BF16: + { + GGML_ASSERT(tensor->nb[0] == sizeof(ggml_bf16_t)); + ((ggml_bf16_t *)(tensor->data))[i] = GGML_FP32_TO_BF16(value); + } break; case GGML_TYPE_F32: { GGML_ASSERT(tensor->nb[0] == sizeof(float)); @@ -3392,6 +3698,8 @@ float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, return ((int32_t *) data)[0]; case GGML_TYPE_F16: return GGML_FP16_TO_FP32(((ggml_fp16_t *) data)[0]); + case GGML_TYPE_BF16: + return GGML_BF16_TO_FP32(((ggml_bf16_t *) data)[0]); case GGML_TYPE_F32: return ((float *) data)[0]; default: @@ -3420,6 +3728,10 @@ void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, { ((ggml_fp16_t *)(data))[0] = GGML_FP32_TO_FP16(value); } break; + case GGML_TYPE_BF16: + { + ((ggml_bf16_t *)(data))[0] = GGML_FP32_TO_BF16(value); + } break; case GGML_TYPE_F32: { ((float *)(data))[0] = value; @@ -3614,7 +3926,11 @@ static struct ggml_tensor * ggml_add_cast_impl( // TODO: support less-strict constraint // GGML_ASSERT(ggml_can_repeat(b, a)); GGML_ASSERT(ggml_can_repeat_rows(b, a)); - GGML_ASSERT(ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16); // currently only supported for quantized input and f16 + + // currently only supported for quantized input and f16 + GGML_ASSERT(ggml_is_quantized(a->type) || + a->type == GGML_TYPE_F16 || + a->type == GGML_TYPE_BF16); bool is_node = false; @@ -4497,6 +4813,8 @@ struct ggml_tensor * ggml_mul_mat( void ggml_mul_mat_set_prec( struct ggml_tensor * a, enum ggml_prec prec) { + GGML_ASSERT(a->op == GGML_OP_MUL_MAT); + const int32_t prec_i32 = (int32_t) prec; ggml_set_op_params_i32(a, 0, prec_i32); @@ -4504,45 +4822,47 @@ void ggml_mul_mat_set_prec( // ggml_mul_mat_id +/* + c = ggml_mul_mat_id(ctx, as, b, ids); + + as -> [cols, rows, n_expert] + ids -> [n_experts_used, n_tokens] (i32) + b -> [cols, n_expert_used, n_tokens] + c -> [cols, n_expert_used, n_tokens] + + in b, n_experts_used can be broadcasted to match the n_expert_used of ids + + c ~= as[:,:,i] @ b[:,i%r,t], i = ids[e,t] for all e,t in ids +*/ struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, - struct ggml_tensor * const as[], - int n_as, - struct ggml_tensor * ids, - int id, - struct ggml_tensor * b) { - + struct ggml_tensor * as, + struct ggml_tensor * b, + struct ggml_tensor * ids) { + GGML_ASSERT(!ggml_is_transposed(as)); GGML_ASSERT(ids->type == GGML_TYPE_I32); - GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); - GGML_ASSERT(ids->ne[1] == b->ne[1]); - GGML_ASSERT(ids->ne[2] == b->ne[2] && ids->ne[3] == b->ne[3]); - GGML_ASSERT(n_as > 0 && n_as <= GGML_MAX_SRC - 2); - GGML_ASSERT(id >= 0 && id < ids->ne[0]); + + GGML_ASSERT(as->ne[3] == 1); // as is 3d (one matrix per expert) + GGML_ASSERT(b->ne[3] == 1); // b is 3d + GGML_ASSERT(ids->ne[2] == 1 && ids->ne[3] == 1); // ids is 2d + GGML_ASSERT(ids->ne[1] == b->ne[2]); // must have an expert list per b row + GGML_ASSERT(as->ne[0] == b->ne[0]); // can_mul_mat + GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast bool is_node = false; - if (as[0]->grad || b->grad) { + if (as->grad || b->grad) { is_node = true; } - const int64_t ne[4] = { as[0]->ne[1], b->ne[1], b->ne[2], b->ne[3] }; + const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_set_op_params_i32(result, 0, id); - ggml_set_op_params_i32(result, 1, n_as); - result->op = GGML_OP_MUL_MAT_ID; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = ids; + result->src[0] = as; result->src[1] = b; - - for (int i = 0; i < n_as; i++) { - struct ggml_tensor * a = as[i]; - GGML_ASSERT(ggml_are_same_shape(as[0], a)); - GGML_ASSERT(ggml_can_mul_mat(a, b)); - GGML_ASSERT(!ggml_is_transposed(a)); - result->src[i + 2] = a; - } + result->src[2] = ids; return result; } @@ -5333,17 +5653,23 @@ static struct ggml_tensor * ggml_soft_max_impl( GGML_ASSERT(ggml_is_contiguous(a)); if (mask) { + GGML_ASSERT(mask->type == GGML_TYPE_F16 || mask->type == GGML_TYPE_F32); GGML_ASSERT(ggml_is_contiguous(mask)); GGML_ASSERT(ggml_is_matrix(mask)); - GGML_ASSERT(ggml_can_repeat_rows(mask, a)); + GGML_ASSERT(mask->ne[0] == a->ne[0]); + GGML_ASSERT(mask->ne[1] >= a->ne[1]); } if (pos) { GGML_ASSERT(ggml_is_vector(pos)); - GGML_ASSERT(pos->type == GGML_TYPE_F32); + GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32); GGML_ASSERT(pos->ne[0] == a->ne[0]); } + if (pos && mask) { + GGML_ASSERT(pos->type == mask->type); + } + if (max_bias > 0.0f) { GGML_ASSERT(pos); } @@ -6152,6 +6478,59 @@ struct ggml_tensor * ggml_flash_attn( return result; } +// ggml_flash_attn_ext + +struct ggml_tensor * ggml_flash_attn_ext( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * mask, + float scale) { + GGML_ASSERT(ggml_can_mul_mat(k, q)); + // TODO: check if vT can be multiplied by (k*qT) + if (mask) { + GGML_ASSERT(ggml_is_contiguous(mask)); + GGML_ASSERT(mask->ne[2] == 1); + GGML_ASSERT(mask->ne[3] == 1); + GGML_ASSERT(mask->ne[1] >= GGML_PAD(q->ne[1], GGML_KQ_MASK_PAD) && + "the Flash-Attention kernel requires the mask to be padded to GGML_KQ_MASK_PAD and at least n_queries big"); + //GGML_ASSERT(ggml_can_repeat_rows(mask, qk)); + } + + bool is_node = false; + + if (q->grad || k->grad || v->grad) { + is_node = true; + } + + // permute(0, 2, 1, 3) + int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + float params[] = { scale }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_FLASH_ATTN_EXT; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = q; + result->src[1] = k; + result->src[2] = v; + result->src[3] = mask; + + return result; +} + +void ggml_flash_attn_ext_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec) { + GGML_ASSERT(a->op == GGML_OP_FLASH_ATTN_EXT); + + const int32_t prec_i32 = (int32_t) prec; + + ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos +} + // ggml_flash_ff struct ggml_tensor * ggml_flash_ff( @@ -7028,8 +7407,8 @@ static void ggml_compute_forward_dup_same_cont( ((char *) src0->data + ie0*nb00), (ie1 - ie0) * ggml_type_size(src0->type)); } - } + static void ggml_compute_forward_dup_f16( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -7303,6 +7682,366 @@ static void ggml_compute_forward_dup_f16( } } +static void ggml_compute_forward_dup_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + GGML_TENSOR_UNARY_OP_LOCALS + + const int ith = params->ith; // thread index + const int nth = params->nth; // number of threads + + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) { + ggml_compute_forward_dup_same_cont(params, dst); + return; + } + + // parallelize by rows + const int nr = ne01; + // number of rows per thread + const int dr = (nr + nth - 1) / nth; + // row range for this thread + const int ir0 = dr * ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (src0->type == dst->type && + ne00 == ne0 && + nb00 == ggml_type_size(src0->type) && nb0 == ggml_type_size(dst->type)) { + // copy by rows + const size_t rs = ne00*nb00; + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = ir0; i01 < ir1; i01++) { + memcpy( + ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3), + ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03), + rs); + } + } + } + return; + } + + // TODO: add more special-case implementations for tensor shapes/strides that can benefit from memcpy + + if (ggml_is_contiguous(dst)) { + if (nb00 == sizeof(ggml_bf16_t)) { + if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + const size_t rs = ne00 * nb00; + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + memcpy(dst_ptr + id, src0_ptr, rs); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(src0_ptr[i00])); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + for (int i00 = 0; i00 < ne00; i00++) { + dst_ptr[id] = GGML_BF16_TO_FP32(src0_ptr[i00]); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (type_traits[dst->type].from_float) { + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith; + + size_t id = 0; + size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + char * dst_ptr = (char *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += rs * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + + for (int i00 = 0; i00 < ne00; i00++) { + src0_f32[i00] = GGML_BF16_TO_FP32(src0_ptr[i00]); + } + + quantize_row_q(src0_f32, dst_ptr + id, ne00); + id += rs; + } + id += rs * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } else { + //printf("%s: this is not optimal - fix me\n", __func__); + + if (dst->type == GGML_TYPE_F32) { + size_t id = 0; + float * dst_ptr = (float *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_BF16_TO_FP32(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = *src0_ptr; + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else if (dst->type == GGML_TYPE_F16) { + size_t id = 0; + ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*src0_ptr)); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } + } + return; + } + + // dst counters + int64_t i10 = 0; + int64_t i11 = 0; + int64_t i12 = 0; + int64_t i13 = 0; + + if (dst->type == GGML_TYPE_BF16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + memcpy(dst_ptr, src0_ptr, sizeof(ggml_bf16_t)); + + if (++i10 == ne00) { + i10 = 0; + if (++i11 == ne01) { + i11 = 0; + if (++i12 == ne02) { + i12 = 0; + if (++i13 == ne03) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_fp16_t *) dst_ptr = GGML_FP32_TO_FP16(GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr)); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else if (dst->type == GGML_TYPE_F32) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(float *) dst_ptr = GGML_BF16_TO_FP32(*(const ggml_bf16_t *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + } else { + GGML_ASSERT(false); // TODO: implement + } +} + static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -7430,6 +8169,24 @@ static void ggml_compute_forward_dup_f32( id += ne00 * (ne01 - ir1); } } + } else if (dst->type == GGML_TYPE_BF16) { + size_t id = 0; + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) dst->data; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + id += ne00 * ir0; + for (int i01 = ir0; i01 < ir1; i01++) { + for (int i00 = 0; i00 < ne00; i00++) { + const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + dst_ptr[id] = GGML_FP32_TO_BF16(*src0_ptr); + id++; + } + } + id += ne00 * (ne01 - ir1); + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -7549,6 +8306,58 @@ static void ggml_compute_forward_dup_f32( } } } + } else if (dst->type == GGML_TYPE_BF16) { + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + i10 += ne00 * ir0; + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + for (int64_t i01 = ir0; i01 < ir1; i01++) { + for (int64_t i00 = 0; i00 < ne00; i00++) { + const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3); + + *(ggml_bf16_t *) dst_ptr = GGML_FP32_TO_BF16(*(const float *) src0_ptr); + + if (++i10 == ne0) { + i10 = 0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } + i10 += ne00 * (ne01 - ir1); + while (i10 >= ne0) { + i10 -= ne0; + if (++i11 == ne1) { + i11 = 0; + if (++i12 == ne2) { + i12 = 0; + if (++i13 == ne3) { + i13 = 0; + } + } + } + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -7722,6 +8531,10 @@ static void ggml_compute_forward_dup( { ggml_compute_forward_dup_f16(params, dst); } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_dup_bf16(params, dst); + } break; case GGML_TYPE_F32: { ggml_compute_forward_dup_f32(params, dst); @@ -7904,6 +8717,85 @@ static void ggml_compute_forward_add_f16_f32( } } +static void ggml_compute_forward_add_bf16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + if (dst->type == GGML_TYPE_F32) { + GGML_ASSERT( nb0 == sizeof(float)); + } + else { + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + } + + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + if (dst->type == GGML_TYPE_BF16) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]); + } + } + } else { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + float * dst_ptr = (float *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_BF16_TO_FP32(src0_ptr[i]) + src1_ptr[i]; + } + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + static void ggml_compute_forward_add_f16_f16( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -7960,6 +8852,62 @@ static void ggml_compute_forward_add_f16_f16( } } +static void ggml_compute_forward_add_bf16_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_BF16); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(ggml_bf16_t)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src0, src1 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + ggml_bf16_t * src1_ptr = (ggml_bf16_t *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11); + + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + GGML_BF16_TO_FP32(src1_ptr[i])); + } + } + } + else { + // src1 is not contiguous + GGML_ASSERT(false); + } +} + static void ggml_compute_forward_add_q_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8069,6 +9017,18 @@ static void ggml_compute_forward_add( GGML_ASSERT(false); } } break; + case GGML_TYPE_BF16: + { + if (src1->type == GGML_TYPE_BF16) { + ggml_compute_forward_add_bf16_bf16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_bf16_f32(params, dst); + } + else { + GGML_ASSERT(false); + } + } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -8083,6 +9043,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -8326,6 +9287,110 @@ static void ggml_compute_forward_add1_q_f32( } } +static void ggml_compute_forward_add1_bf16_f32( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + // scalar to add + const float v = *(float *) src1->data; + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + +static void ggml_compute_forward_add1_bf16_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + GGML_ASSERT(ggml_is_scalar(src1)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + // scalar to add + const float v = GGML_BF16_TO_FP32(*(ggml_bf16_t *) src1->data); + + const int ith = params->ith; + const int nth = params->nth; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_UNARY_OP_LOCALS + + GGML_ASSERT(src0->type == GGML_TYPE_BF16); + GGML_ASSERT(src1->type == GGML_TYPE_BF16); + GGML_ASSERT(dst->type == GGML_TYPE_BF16); + + GGML_ASSERT( nb0 == sizeof(ggml_bf16_t)); + GGML_ASSERT(nb00 == sizeof(ggml_bf16_t)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int ir = ir0; ir < ir1; ++ir) { + // src0 and dst are same shape => same indices + const int i3 = ir/(ne2*ne1); + const int i2 = (ir - i3*ne2*ne1)/ne1; + const int i1 = (ir - i3*ne2*ne1 - i2*ne1); + + ggml_bf16_t * dst_ptr = (ggml_bf16_t *) ((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 ); + ggml_bf16_t * src0_ptr = (ggml_bf16_t *) ((char *) src0->data + i3*nb03 + i2*nb02 + i1*nb01); + for (int i = 0; i < ne0; i++) { + dst_ptr[i] = GGML_FP32_TO_BF16(GGML_BF16_TO_FP32(src0_ptr[i]) + v); + } + } +} + static void ggml_compute_forward_add1( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -8350,6 +9415,18 @@ static void ggml_compute_forward_add1( GGML_ASSERT(false); } } break; + case GGML_TYPE_BF16: + { + if (src1->type == GGML_TYPE_BF16) { + ggml_compute_forward_add1_bf16_bf16(params, dst); + } + else if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add1_bf16_f32(params, dst); + } + else { + GGML_ASSERT(false); + } + } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -8365,6 +9442,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -8477,6 +9555,7 @@ static void ggml_compute_forward_acc( ggml_compute_forward_acc_f32(params, dst); } break; case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -8492,6 +9571,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -8997,6 +10077,40 @@ static void ggml_compute_forward_sum_f16( ((ggml_fp16_t *) dst->data)[0] = GGML_FP32_TO_FP16(sum); } +static void ggml_compute_forward_sum_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + + assert(params->ith == 0); + assert(ggml_is_scalar(dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + assert(src0->nb[0] == sizeof(ggml_bf16_t)); + + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) + + float sum = 0; + float row_sum = 0; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + for (int64_t i01 = 0; i01 < ne01; i01++) { + ggml_vec_sum_bf16_ggf(ne00, + &row_sum, + (ggml_bf16_t *) ((char *) src0->data + i01 * nb01 + i02 * nb02 + i03 * nb03)); + sum += row_sum; + } + } + } + ((ggml_bf16_t *) dst->data)[0] = GGML_FP32_TO_BF16(sum); +} + static void ggml_compute_forward_sum( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -9012,6 +10126,10 @@ static void ggml_compute_forward_sum( { ggml_compute_forward_sum_f16(params, dst); } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_sum_bf16(params, dst); + } break; default: { GGML_ASSERT(false); @@ -9286,6 +10404,7 @@ static void ggml_compute_forward_repeat( switch (src0->type) { case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_I16: { ggml_compute_forward_repeat_f16(params, dst); @@ -10745,6 +11864,28 @@ static void ggml_compute_forward_mul_mat( } #endif +#if GGML_USE_LLAMAFILE + if (src1_cont) { + for (int64_t i13 = 0; i13 < ne13; i13++) + for (int64_t i12 = 0; i12 < ne12; i12++) + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + nb01/ggml_type_size(src0->type), + (const char *)src1->data + i12*nb12 + i13*nb13, + nb11/ggml_type_size(src1->type), + (char *)dst->data + i12*nb2 + i13*nb3, + nb1/ggml_type_size(dst->type), + ith, nth, + params->type, + src0->type, + src1->type, + dst->type)) + goto UseGgmlGemm1; + return; + } +UseGgmlGemm1:; +#endif + if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { return; @@ -10776,6 +11917,28 @@ static void ggml_compute_forward_mul_mat( const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); +#if GGML_USE_LLAMAFILE + if (src1->type != vec_dot_type) { + for (int64_t i13 = 0; i13 < ne13; i13++) + for (int64_t i12 = 0; i12 < ne12; i12++) + if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type), + (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03, + nb01/ggml_type_size(src0->type), + (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size, + row_size/ggml_type_size(vec_dot_type), + (char *)dst->data + i12*nb2 + i13*nb3, + nb1/ggml_type_size(dst->type), + ith, nth, + params->type, + src0->type, + vec_dot_type, + dst->type)) + goto UseGgmlGemm2; + return; + } +UseGgmlGemm2:; +#endif + const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne1*ne12*ne13; // src1 rows @@ -10876,10 +12039,9 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_compute_params * params, struct ggml_tensor * dst) { - const struct ggml_tensor * ids = dst->src[0]; + const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src1 = dst->src[1]; - - const struct ggml_tensor * src0 = dst->src[2]; // only for GGML_TENSOR_BINARY_OP_LOCALS + const struct ggml_tensor * ids = dst->src[2]; GGML_TENSOR_BINARY_OP_LOCALS @@ -10894,11 +12056,6 @@ static void ggml_compute_forward_mul_mat_id( enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type; ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float; - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - // we don't support permuted src0 or src1 GGML_ASSERT(nb00 == ggml_type_size(type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); @@ -10909,22 +12066,21 @@ static void ggml_compute_forward_mul_mat_id( GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); - // broadcast factors - const int64_t r2 = ne12/ne02; - const int64_t r3 = ne13/ne03; - // row groups - const int id = ggml_get_op_params_i32(dst, 0); - const int n_as = ggml_get_op_params_i32(dst, 1); + const int n_ids = ids->ne[0]; // n_expert_used + const int n_as = ne02; // n_expert char * wdata_src1_end = (src1->type == vec_dot_type) ? (char *) params->wdata : (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t)); - int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] - int64_t * matrix_rows = matrix_row_counts + n_as; // [n_as][ne11] + struct mmid_row_mapping { + int32_t i1; + int32_t i2; + }; - #define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne11 + (i1)] + int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as] + struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11] if (params->type == GGML_TASK_TYPE_INIT) { if (ith != 0) { @@ -10948,16 +12104,20 @@ static void ggml_compute_forward_mul_mat_id( } // initialize matrix_row_counts - GGML_ASSERT(wdata == wdata_src1_end); memset(matrix_row_counts, 0, n_as*sizeof(int64_t)); - // group rows by src0 matrix - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - const int32_t row_id = *(const int32_t *) ((const char *) ids->data + i01*ids->nb[1] + id*ids->nb[0]); +#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)] - GGML_ASSERT(row_id >= 0 && row_id < n_as); - MMID_MATRIX_ROW(row_id, matrix_row_counts[row_id]) = i01; - matrix_row_counts[row_id] += 1; + // group rows by src0 matrix + for (int64_t iid1 = 0; iid1 < ids->ne[1]; ++iid1) { + for (int id = 0; id < n_ids; ++id) { + const int32_t i02 = *(const int32_t *) ((const char *) ids->data + iid1*ids->nb[1] + id*ids->nb[0]); + + assert(i02 >= 0 && i02 < n_as); + + MMID_MATRIX_ROW(i02, matrix_row_counts[i02]) = (struct mmid_row_mapping) {id, iid1}; + matrix_row_counts[i02] += 1; + } } return; @@ -10975,15 +12135,13 @@ static void ggml_compute_forward_mul_mat_id( continue; } - const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + const char * src0_cur = (const char *) src0->data + cur_a*nb02; const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); - const int64_t nr0 = ne01; // src0 rows - const int64_t nr1 = cne1*ne12*ne13; // src1 rows - - //printf("nr0 = %lld, nr1 = %lld\n", nr0, nr1); + const int64_t nr0 = ne01; // src0 rows + const int64_t nr1 = cne1; // src1 rows // distribute the thread work across the inner or outer loop based on which one is larger @@ -11002,16 +12160,11 @@ static void ggml_compute_forward_mul_mat_id( const int64_t ir110 = dr1*ith1; const int64_t ir111 = MIN(ir110 + dr1, nr1); - //printf("ir010 = %6lld, ir011 = %6lld, ir110 = %6lld, ir111 = %6lld\n", ir010, ir011, ir110, ir111); - // threads with no work simply yield (not sure if it helps) - if (ir010 >= ir011 || ir110 >= ir111) { - sched_yield(); - continue; - } - - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); + //if (ir010 >= ir011 || ir110 >= ir111) { + // sched_yield(); + // continue; + //} // block-tiling attempt const int64_t blck_0 = 16; @@ -11023,20 +12176,16 @@ static void ggml_compute_forward_mul_mat_id( for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) { for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) { for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) { - const int64_t i13 = (ir1/(ne12*cne1)); // Note: currently, src1 is always a matrix - const int64_t i12 = (ir1 - i13*ne12*cne1)/cne1; - const int64_t _i11 = (ir1 - i13*ne12*cne1 - i12*cne1); - const int64_t i11 = MMID_MATRIX_ROW(cur_a, _i11); + const int64_t _i12 = ir1; // logical row index for this expert - // broadcast src0 into src1 - const int64_t i03 = i13/r3; - const int64_t i02 = i12/r2; + struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12); + const int id = row_mapping.i1; // selected expert index - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + const int64_t i11 = id % ne11; + const int64_t i12 = row_mapping.i2; // row index in src1 - const char * src0_row = (const char *) src0_cur->data + (0 + i02*nb02 + i03*nb03); + const int64_t i1 = id; // selected expert index + const int64_t i2 = i12; // row // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using @@ -11044,25 +12193,26 @@ static void ggml_compute_forward_mul_mat_id( // TODO: this is a bit of a hack, we should probably have a better way to handle this const char * src1_col = (const char *) wdata + (src1_cont || src1->type != vec_dot_type - ? (i11 + i12*ne11 + i13*ne12*ne11)*row_size - : (i11*nb11 + i12*nb12 + i13*nb13)); + ? (i11 + i12*ne11)*row_size + : (i11*nb11 + i12*nb12)); - float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)); + float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2)); //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); //} for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) { - vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_row + ir0*nb01, 0, src1_col, 0, 1); + vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1); } + memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float)); } } } } - #undef MMID_MATRIX_ROW +#undef MMID_MATRIX_ROW } // ggml_compute_forward_out_prod @@ -11395,6 +12545,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11571,6 +12722,7 @@ static void ggml_compute_forward_set( ggml_compute_forward_set_f32(params, dst); } break; case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -11586,6 +12738,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11744,6 +12897,49 @@ static void ggml_compute_forward_get_rows_f16( } } +static void ggml_compute_forward_get_rows_bf16( + const struct ggml_compute_params * params, + struct ggml_tensor * dst) { + + const struct ggml_tensor * src0 = dst->src[0]; + const struct ggml_tensor * src1 = dst->src[1]; + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS + + const int64_t nc = ne00; + const int64_t nr = ggml_nelements(src1); + + assert(ne0 == nc); + assert(ne02 == ne11); + assert(nb00 == sizeof(ggml_bf16_t)); + assert(ggml_nrows(dst) == nr); + + const int ith = params->ith; + const int nth = params->nth; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int64_t i = ir0; i < ir1; ++i) { + const int64_t i12 = i/(ne11*ne10); + const int64_t i11 = (i - i12*ne11*ne10)/ne10; + const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10); + const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12); + + ggml_bf16_to_fp32_row( + (const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), + (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); + } +} + static void ggml_compute_forward_get_rows_f32( const struct ggml_compute_params * params, struct ggml_tensor * dst) { @@ -11809,6 +13005,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -11820,6 +13017,10 @@ static void ggml_compute_forward_get_rows( { ggml_compute_forward_get_rows_f16(params, dst); } break; + case GGML_TYPE_BF16: + { + ggml_compute_forward_get_rows_bf16(params, dst); + } break; case GGML_TYPE_F32: case GGML_TYPE_I32: { @@ -12154,7 +13355,7 @@ static void ggml_compute_forward_soft_max_f32( GGML_TENSOR_UNARY_OP_LOCALS - const int64_t ne11 = src1 ? src1->ne[1] : 1; + //const int64_t ne11 = src1 ? src1->ne[1] : 1; // TODO: is this supposed to be ceil instead of floor? // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 @@ -12177,19 +13378,31 @@ static void ggml_compute_forward_soft_max_f32( float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; // when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching - float * pos = src2 ? (float *) src2->data : src0->data; + ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data; + float * pos_f32 = src2 ? (float *) src2->data : src0->data; + + const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16); for (int i1 = ir0; i1 < ir1; i1++) { float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); // broadcast the mask across rows - float * mp = src1 ? (float *)((char *) src1->data + (i1%ne11)*src1->nb[1]) : NULL; + ggml_fp16_t * mp_f16 = src1 ? (ggml_fp16_t *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; + float * mp_f32 = src1 ? (float *)((char *) src1->data) + (i1%ne01)*ne00 : NULL; ggml_vec_cpy_f32 (nc, wp, sp); ggml_vec_scale_f32(nc, wp, scale); - if (mp) { - ggml_vec_acc_f32(nc, wp, mp); + if (mp_f32) { + if (use_f16) { + for (int i = 0; i < nc; ++i) { + wp[i] += GGML_FP16_TO_FP32(mp_f16[i]); + } + } else { + for (int i = 0; i < nc; ++i) { + wp[i] += mp_f32[i]; + } + } } // ALiBi bias @@ -12197,8 +13410,14 @@ static void ggml_compute_forward_soft_max_f32( const uint32_t h = (i1/ne01)%ne02; // head const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1); - for (int i = 0; i < nc; i++) { - wp[i] = wp[i] + slope*pos[i]; + if (use_f16) { + for (int i = 0; i < nc; ++i) { + wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[i]); + } + } else { + for (int i = 0; i < nc; ++i) { + wp[i] += slope*pos_f32[i]; + } } } @@ -12497,6 +13716,7 @@ static void ggml_compute_forward_alibi( { ggml_compute_forward_alibi_f32(params, dst); } break; + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -12512,6 +13732,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -12585,6 +13806,7 @@ static void ggml_compute_forward_clamp( ggml_compute_forward_clamp_f32(params, dst); } break; case GGML_TYPE_F16: + case GGML_TYPE_BF16: case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: case GGML_TYPE_Q5_0: @@ -12600,6 +13822,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ4_NL: case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ3_S: @@ -14466,6 +15689,198 @@ static void ggml_compute_forward_flash_attn( } } +// ggml_compute_forward_flash_attn_ext + +static void ggml_compute_forward_flash_attn_ext_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * mask, + struct ggml_tensor * dst) { + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_LOCALS(int64_t, neq, q, ne) + GGML_TENSOR_LOCALS(size_t, nbq, q, nb) + GGML_TENSOR_LOCALS(int64_t, nek, k, ne) + GGML_TENSOR_LOCALS(size_t, nbk, k, nb) + GGML_TENSOR_LOCALS(int64_t, nev, v, ne) + GGML_TENSOR_LOCALS(size_t, nbv, v, nb) + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t D = neq0; + const int64_t N = neq1; + + GGML_ASSERT(ne0 == D); + GGML_ASSERT(ne2 == N); + + GGML_ASSERT(nbq0 == sizeof(float)); + GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t)); + + GGML_ASSERT(neq0 == D); + GGML_ASSERT(nek0 == D); + GGML_ASSERT(nev0 == D); + + GGML_ASSERT(neq1 == N); + GGML_ASSERT(nev0 == D); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + // broadcast factors + const int64_t rk2 = neq2/nek2; + const int64_t rk3 = neq3/nek3; + + const int64_t rv2 = neq2/nev2; + const int64_t rv3 = neq3/nev3; + + if (params->type == GGML_TASK_TYPE_INIT) { + return; + } + + if (params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + // parallelize by q rows using ggml_vec_dot_f32 + + // total rows in q + const int nr = neq1*neq2*neq3; + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + float scale = 1.0f; + memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); + + // loop over n_batch and n_head + for (int ir = ir0; ir < ir1; ++ir) { + // q indices + const int iq3 = ir/(neq2*neq1); + const int iq2 = (ir - iq3*neq2*neq1)/neq1; + const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); + + float S = 0.0f; + float M = -INFINITY; + + float * V32 = (float *) params->wdata + ith*(2*D + CACHE_LINE_SIZE_F32); + ggml_fp16_t * Q16 = (ggml_fp16_t *) (V32); // reuse memory + ggml_fp16_t * V16 = (ggml_fp16_t *) (V32 + D); + + memset(V16, 0, D*sizeof(ggml_fp16_t)); + + const ggml_fp16_t * mp = mask ? (ggml_fp16_t *)((char *) mask->data + iq1*mask->nb[1]) : NULL; + + // k indices + const int ik3 = iq3 / rk3; + const int ik2 = iq2 / rk2; + + // v indices + const int iv3 = iq3 / rv3; + const int iv2 = iq2 / rv2; + + // online softmax / attention + // loop over n_kv and n_head_kv + // ref: https://arxiv.org/pdf/2112.05682.pdf + for (int64_t ic = 0; ic < nek1; ++ic) { + const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f; + if (mv == -INFINITY) { + continue; + } + + float s; + + // convert Q to F16 in V32 + { + const float * pq = (const float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)); + + for (int64_t d = 0; d < D; ++d) { + Q16[d] = GGML_FP32_TO_FP16(pq[d]); + } + } + + ggml_vec_dot_f16(D, + &s, 0, + (ggml_fp16_t *) ((char *) k->data + ( ic*nbk1 + ik2*nbk2 + ik3*nbk3)), 0, + Q16, 0, 1); + + s = s*scale + mv; + + const float Mold = M; + + float ms = 1.0f; + float vs = 1.0f; + + if (s > M) { + M = s; + ms = expf(Mold - M); + + // V = V*expf(Mold - M) + ggml_vec_scale_f16(D, V16, ms); + } else { + vs = expf(s - M); + } + + const ggml_fp16_t * v16 = (const ggml_fp16_t *) ((char *) v->data + (ic*nbv1 + iv2*nbv2 + iv3*nbv3)); + + // V += v*expf(s - M) + ggml_vec_mad_f16(D, V16, v16, vs); + + S = S*ms + vs; + } + + // V /= S + for (int64_t d = 0; d < D; ++d) { + V32[d] = GGML_FP16_TO_FP32(V16[d])/S; + } + + // dst indices + const int i1 = iq1; + const int i2 = iq2; + const int i3 = iq3; + + // original + //memcpy((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3), V, nev0*sizeof(float)); + + // permute(0, 2, 1, 3) + memcpy((char *) dst->data + (i3*ne2*ne1 + i2 + i1*ne1)*nb1, V32, nb1); + } +} + +static void ggml_compute_forward_flash_attn_ext( + const struct ggml_compute_params * params, + const struct ggml_tensor * q, + const struct ggml_tensor * k, + const struct ggml_tensor * v, + const struct ggml_tensor * mask, + struct ggml_tensor * dst) { + switch (dst->op_params[1]) { + case GGML_PREC_DEFAULT: + case GGML_PREC_F32: + { + // uses F32 accumulators + ggml_compute_forward_flash_attn_ext_f16(params, q, k, v, mask, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_flash_ff static void ggml_compute_forward_flash_ff_f16( @@ -15485,6 +16900,7 @@ static void ggml_compute_forward_get_rel_pos( switch (src0->type) { case GGML_TYPE_F16: + case GGML_TYPE_BF16: { ggml_compute_forward_get_rel_pos_f16(params, dst); } break; @@ -16041,30 +17457,10 @@ static void ggml_compute_forward_cross_entropy_loss_back( static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); - if (tensor->op == GGML_OP_NONE) { + if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) { return; } -#if defined(GGML_USE_VULKAN) - const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); -#ifdef GGML_VULKAN_CHECK_RESULTS - if (skip_cpu) { - ggml_vk_check_results_1_cpu_assist(params, tensor); - } -#endif - if (skip_cpu) { - return; - } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#endif // GGML_USE_VULKAN - -#ifdef GGML_USE_SYCL - bool skip_cpu = ggml_sycl_compute_forward(params, tensor); - if (skip_cpu) { - return; - } -#endif // GGML_USE_SYCL switch (tensor->op) { case GGML_OP_DUP: { @@ -16293,6 +17689,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm const bool masked = t != 0; ggml_compute_forward_flash_attn(params, masked, tensor); } break; + case GGML_OP_FLASH_ATTN_EXT: + { + ggml_compute_forward_flash_attn_ext(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); + } break; case GGML_OP_FLASH_FF: { ggml_compute_forward_flash_ff(params, tensor); @@ -17305,6 +18705,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor GGML_ASSERT(false); // TODO: not implemented } break; case GGML_OP_FLASH_ATTN: + case GGML_OP_FLASH_ATTN_EXT: { struct ggml_tensor * flash_grad = NULL; if (src0->grad || src1->grad || tensor->src[2]->grad) { @@ -17916,6 +19317,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) { int n_tasks = 0; + if (ggml_is_empty(node)) { + // no need to multi-thread a no-op + n_tasks = 1; + return n_tasks; + } + switch (node->op) { case GGML_OP_CPY: case GGML_OP_DUP: @@ -18071,6 +19478,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_ n_tasks = n_threads; } break; case GGML_OP_FLASH_ATTN: + case GGML_OP_FLASH_ATTN_EXT: { n_tasks = n_threads; } break; @@ -18357,7 +19765,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa case GGML_OP_CPY: case GGML_OP_DUP: { - if (ggml_is_quantized(node->type)) { + if (ggml_is_quantized(node->type) || + // F16 -> BF16 and BF16 -> F16 copies go through intermediate F32 + (node->src[0]->type == GGML_TYPE_F16 && node->src[1] && node->src[1]->type == GGML_TYPE_BF16) || + (node->src[0]->type == GGML_TYPE_BF16 && node->src[1] && node->src[1]->type == GGML_TYPE_F16)) { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; } } break; @@ -18401,16 +19812,16 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa case GGML_OP_MUL_MAT_ID: { cur = 0; - const struct ggml_tensor * src0 = node->src[2]; + const struct ggml_tensor * src0 = node->src[0]; const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } - const int n_as = ggml_get_op_params_i32(node, 1); + const int n_as = src0->ne[2]; cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts - cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows + cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows } break; case GGML_OP_OUT_PROD: { @@ -18436,7 +19847,8 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa const int64_t ne10 = node->src[1]->ne[0]; // L const int64_t ne11 = node->src[1]->ne[1]; // Cin - if (node->src[0]->type == GGML_TYPE_F16 && + if ((node->src[0]->type == GGML_TYPE_F16 || + node->src[0]->type == GGML_TYPE_BF16) && node->src[1]->type == GGML_TYPE_F32) { cur += sizeof(ggml_fp16_t)*ne00*ne01*ne02; cur += sizeof(ggml_fp16_t)*ne10*ne11; @@ -18472,8 +19884,17 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } else if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*ne11*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*ne11*n_tasks; // this is overestimated by x2 } } break; + case GGML_OP_FLASH_ATTN_EXT: + { + const int64_t ne00 = node->src[0]->ne[0]; // D + + cur = 2*sizeof(float)*ne00*n_tasks; // 2x head size + } break; case GGML_OP_FLASH_FF: { if (node->src[1]->type == GGML_TYPE_F32) { @@ -18482,6 +19903,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } else if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*node->src[1]->ne[1]*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*node->src[1]->ne[1]*n_tasks; // this is overestimated by x2 } } break; case GGML_OP_FLASH_ATTN_BACK: @@ -18495,6 +19919,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } else if (node->src[1]->type == GGML_TYPE_F16) { cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 + } else if (node->src[1]->type == GGML_TYPE_BF16) { + cur = sizeof(float)*mxDn*n_tasks; // TODO: this can become (n_tasks-1) + cur += sizeof(float)*mxDn*n_tasks; // this is overestimated by x2 } } break; @@ -18534,17 +19961,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } -#ifdef GGML_USE_VULKAN - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]); - } - ggml_vk_preallocate_buffers_cpu_assist(); - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1); - } -#endif - const int n_threads = cplan->n_threads; struct ggml_compute_state_shared state_shared = { @@ -18601,10 +20017,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl } } -#ifdef GGML_USE_VULKAN - ggml_vk_graph_cleanup_cpu_assist(); -#endif - // performance stats (graph) { int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles; @@ -18739,7 +20151,7 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { // write binary data { - FILE * fout = fopen(fname, "wb"); + FILE * fout = ggml_fopen(fname, "wb"); if (!fout) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); @@ -18877,7 +20289,7 @@ struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context * // read file into data { - FILE * fin = fopen(fname, "rb"); + FILE * fin = ggml_fopen(fname, "rb"); if (!fin) { fprintf(stderr, "%s: failed to open %s\n", __func__, fname); return result; @@ -19213,7 +20625,7 @@ static void ggml_graph_dump_dot_leaf_edge(FILE * fp, struct ggml_tensor * node, void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) { char color[16]; - FILE * fp = fopen(filename, "w"); + FILE * fp = ggml_fopen(filename, "w"); GGML_ASSERT(fp); fprintf(fp, "digraph G {\n"); @@ -19286,7 +20698,9 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) { fprintf(fp, "%d", ggml_get_i32_1d(node, j)); } - else if (node->type == GGML_TYPE_F32 || node->type == GGML_TYPE_F16) { + else if (node->type == GGML_TYPE_F32 || + node->type == GGML_TYPE_F16 || + node->type == GGML_TYPE_BF16) { fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, j)); } else { @@ -20260,7 +21674,8 @@ void ggml_quantize_init(enum ggml_type type) { case GGML_TYPE_IQ2_XXS: case GGML_TYPE_IQ2_XS: case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ1_S: iq2xs_init_impl(type); break; + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: iq2xs_init_impl(type); break; case GGML_TYPE_IQ3_XXS: iq3xs_init_impl(256); break; case GGML_TYPE_IQ3_S: iq3xs_init_impl(512); break; default: // nothing @@ -20285,18 +21700,19 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) { return type == GGML_TYPE_IQ2_XXS || type == GGML_TYPE_IQ2_XS || - type == GGML_TYPE_IQ1_S; + type == GGML_TYPE_IQ1_S;// || + //type == GGML_TYPE_IQ1_M; } size_t ggml_quantize_chunk( enum ggml_type type, const float * src, void * dst, - int start, - int nrows, - int n_per_row, + int64_t start, + int64_t nrows, + int64_t n_per_row, const float * imatrix) { - const int n = nrows * n_per_row; + const int64_t n = (int64_t) nrows * n_per_row; if (ggml_quantize_requires_imatrix(type)) { GGML_ASSERT(imatrix != NULL); @@ -20329,6 +21745,7 @@ size_t ggml_quantize_chunk( case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; + case GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; #if QK_K == 64 case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break; @@ -20341,6 +21758,12 @@ size_t ggml_quantize_chunk( ggml_fp32_to_fp16_row(src + start, (ggml_fp16_t *)dst + start, n); result = n * elemsize; } break; + case GGML_TYPE_BF16: + { + size_t elemsize = sizeof(ggml_bf16_t); + ggml_fp32_to_bf16_row(src + start, (ggml_bf16_t *)dst + start, n); + result = n * elemsize; + } break; case GGML_TYPE_F32: { size_t elemsize = sizeof(float); @@ -20510,8 +21933,34 @@ static bool gguf_fread_str(FILE * file, struct gguf_str * p, size_t * offset) { return ok; } +static void gguf_free_kv(struct gguf_kv * kv) { + if (kv->key.data) { + GGML_FREE(kv->key.data); + } + + if (kv->type == GGUF_TYPE_STRING) { + if (kv->value.str.data) { + GGML_FREE(kv->value.str.data); + } + } + + if (kv->type == GGUF_TYPE_ARRAY) { + if (kv->value.arr.data) { + if (kv->value.arr.type == GGUF_TYPE_STRING) { + for (uint64_t j = 0; j < kv->value.arr.n; ++j) { + struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; + if (str->data) { + GGML_FREE(str->data); + } + } + } + GGML_FREE(kv->value.arr.data); + } + } +} + struct gguf_context * gguf_init_empty(void) { - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); memcpy(ctx->header.magic, GGUF_MAGIC, sizeof(ctx->header.magic)); ctx->header.version = GGUF_VERSION; @@ -20531,7 +21980,7 @@ struct gguf_context * gguf_init_empty(void) { } struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params) { - FILE * file = fopen(fname, "rb"); + FILE * file = ggml_fopen(fname, "rb"); if (!file) { return NULL; } @@ -20556,7 +22005,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p bool ok = true; - struct gguf_context * ctx = GGML_ALIGNED_MALLOC(sizeof(struct gguf_context)); + struct gguf_context * ctx = GGML_CALLOC(1, sizeof(struct gguf_context)); // read the header { @@ -20593,9 +22042,13 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p // read the kv pairs { - ctx->kv = GGML_MALLOC(ctx->header.n_kv * sizeof(struct gguf_kv)); + const uint64_t n_kv = ctx->header.n_kv; - for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { + // header.n_kv will hold the actual value of pairs that were successfully read in the loop below + ctx->header.n_kv = 0; + ctx->kv = GGML_CALLOC(n_kv, sizeof(struct gguf_kv)); + + for (uint64_t i = 0; i < n_kv; ++i) { struct gguf_kv * kv = &ctx->kv[i]; //fprintf(stderr, "%s: reading kv %d\n", __func__, i); @@ -20644,7 +22097,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * gguf_type_size(kv->value.arr.type)); + kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, gguf_type_size(kv->value.arr.type)); ok = ok && gguf_fread_el(file, kv->value.arr.data, kv->value.arr.n * gguf_type_size(kv->value.arr.type), &offset); } break; @@ -20658,7 +22111,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p return NULL; } - kv->value.arr.data = GGML_MALLOC(kv->value.arr.n * sizeof(struct gguf_str)); + kv->value.arr.data = GGML_CALLOC(kv->value.arr.n, sizeof(struct gguf_str)); for (uint64_t j = 0; j < kv->value.arr.n; ++j) { ok = ok && gguf_fread_str(file, &((struct gguf_str *) kv->value.arr.data)[j], &offset); @@ -20674,6 +22127,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p if (!ok) { break; } + + ctx->header.n_kv++; } if (!ok) { @@ -20685,8 +22140,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p } // read the tensor infos - { - ctx->infos = GGML_MALLOC(ctx->header.n_tensors * sizeof(struct gguf_tensor_info)); + if (ctx->header.n_tensors > 0) { + ctx->infos = GGML_CALLOC(ctx->header.n_tensors, sizeof(struct gguf_tensor_info)); for (uint64_t i = 0; i < ctx->header.n_tensors; ++i) { struct gguf_tensor_info * info = &ctx->infos[i]; @@ -20707,8 +22162,17 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && gguf_fread_el (file, &info->type, sizeof(info->type), &offset); ok = ok && gguf_fread_el (file, &info->offset, sizeof(info->offset), &offset); + // TODO: return an error instead of crashing with GGML_ASSERT gguf_tensor_info_sanitize(info); + // make sure there is no duplicated tensor names + for (uint64_t j = 0; j < i; ++j) { + if (strcmp(info->name.data, ctx->infos[j].name.data) == 0) { + fprintf(stderr, "%s: duplicated tensor name %s\n", __func__, info->name.data); + ok = false; + } + } + if (!ok) { fprintf(stderr, "%s: failed to read tensor info\n", __func__); fclose(file); @@ -20822,12 +22286,12 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p ok = ok && cur != NULL; - ggml_set_name(cur, ctx->infos[i].name.data); - if (!ok) { break; } + ggml_set_name(cur, ctx->infos[i].name.data); + // point the data member to the appropriate location in the binary blob using the tensor infos if (!params.no_alloc) { //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file @@ -20859,31 +22323,7 @@ void gguf_free(struct gguf_context * ctx) { if (ctx->kv) { // free string memory - not great.. for (uint64_t i = 0; i < ctx->header.n_kv; ++i) { - struct gguf_kv * kv = &ctx->kv[i]; - - if (kv->key.data) { - GGML_FREE(kv->key.data); - } - - if (kv->type == GGUF_TYPE_STRING) { - if (kv->value.str.data) { - GGML_FREE(kv->value.str.data); - } - } - - if (kv->type == GGUF_TYPE_ARRAY) { - if (kv->value.arr.data) { - if (kv->value.arr.type == GGUF_TYPE_STRING) { - for (uint64_t j = 0; j < kv->value.arr.n; ++j) { - struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[j]; - if (str->data) { - GGML_FREE(str->data); - } - } - } - GGML_FREE(kv->value.arr.data); - } - } + gguf_free_kv(&ctx->kv[i]); } GGML_FREE(ctx->kv); @@ -20901,7 +22341,7 @@ void gguf_free(struct gguf_context * ctx) { GGML_FREE(ctx->infos); } - GGML_ALIGNED_FREE(ctx); + GGML_FREE(ctx); } const char * gguf_type_name(enum gguf_type type) { @@ -21108,6 +22548,19 @@ static int gguf_get_or_add_key(struct gguf_context * ctx, const char * key) { return n_kv; } +void gguf_remove_key(struct gguf_context * ctx, const char * key) { + const int idx = gguf_find_key(ctx, key); + if (idx >= 0) { + const int n_kv = gguf_get_n_kv(ctx); + gguf_free_kv(&ctx->kv[idx]); + for (int i = idx; i < n_kv-1; ++i) { + ctx->kv[i] = ctx->kv[i+1]; + } + ctx->kv = realloc(ctx->kv, (n_kv - 1) * sizeof(struct gguf_kv)); + ctx->header.n_kv--; + } +} + void gguf_set_val_u8(struct gguf_context * ctx, const char * key, uint8_t val) { const int idx = gguf_get_or_add_key(ctx, key); @@ -21199,7 +22652,7 @@ void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_ty ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = type; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_MALLOC(n*gguf_type_size(type)); + ctx->kv[idx].value.arr.data = GGML_CALLOC(n, gguf_type_size(type)); memcpy(ctx->kv[idx].value.arr.data, data, n*gguf_type_size(type)); } @@ -21209,7 +22662,7 @@ void gguf_set_arr_str(struct gguf_context * ctx, const char * key, const char ** ctx->kv[idx].type = GGUF_TYPE_ARRAY; ctx->kv[idx].value.arr.type = GGUF_TYPE_STRING; ctx->kv[idx].value.arr.n = n; - ctx->kv[idx].value.arr.data = GGML_MALLOC(n*sizeof(struct gguf_str)); + ctx->kv[idx].value.arr.data = GGML_CALLOC(n, sizeof(struct gguf_str)); for (int i = 0; i < n; i++) { struct gguf_str * str = &((struct gguf_str *)ctx->kv[idx].value.arr.data)[i]; str->n = strlen(data[i]); @@ -21236,7 +22689,7 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { case GGUF_TYPE_ARRAY: { if (src->kv[i].value.arr.type == GGUF_TYPE_STRING) { - const char ** data = GGML_MALLOC(src->kv[i].value.arr.n*sizeof(char *)); + const char ** data = GGML_CALLOC(src->kv[i].value.arr.n, sizeof(char *)); for (uint32_t j = 0; j < src->kv[i].value.arr.n; j++) { data[j] = ((struct gguf_str *)src->kv[i].value.arr.data)[j].data; } @@ -21256,6 +22709,10 @@ void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src) { void gguf_add_tensor( struct gguf_context * ctx, const struct ggml_tensor * tensor) { + if (gguf_find_tensor(ctx, tensor->name) != -1) { + GGML_ASSERT(false && "duplicated tensor name"); + } + const int idx = ctx->header.n_tensors; ctx->infos = realloc(ctx->infos, (idx + 1)*sizeof(struct gguf_tensor_info)); @@ -21324,7 +22781,7 @@ struct gguf_buf { static struct gguf_buf gguf_buf_init(size_t size) { struct gguf_buf buf = { - /*buf.data =*/ size == 0 ? NULL : GGML_MALLOC(size), + /*buf.data =*/ size == 0 ? NULL : GGML_CALLOC(1, size), /*buf.size =*/ size, /*buf.offset =*/ 0, }; @@ -21486,7 +22943,7 @@ static void gguf_write_to_buf(const struct gguf_context * ctx, struct gguf_buf * } void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta) { - FILE * file = fopen(fname, "wb"); + FILE * file = ggml_fopen(fname, "wb"); if (!file) { GGML_ASSERT(false && "failed to open file for writing"); } @@ -21628,15 +23085,15 @@ int ggml_cpu_has_wasm_simd(void) { } int ggml_cpu_has_blas(void) { -#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) +#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL) return 1; #else return 0; #endif } -int ggml_cpu_has_cublas(void) { -#if defined(GGML_USE_CUBLAS) +int ggml_cpu_has_cuda(void) { +#if defined(GGML_USE_CUDA) return 1; #else return 0; @@ -21676,7 +23133,7 @@ int ggml_cpu_has_sycl(void) { } int ggml_cpu_has_gpublas(void) { - return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || + return ggml_cpu_has_cuda() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_kompute() || ggml_cpu_has_sycl(); } diff --git a/ggml.h b/ggml.h index c937d4a53..fe6053822 100644 --- a/ggml.h +++ b/ggml.h @@ -214,9 +214,10 @@ # define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__))) #endif -#include -#include #include +#include +#include +#include #define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_VERSION 1 @@ -325,14 +326,20 @@ extern "C" { // get ggml_status name string GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status); + // ieee 754-2008 half-precision float16 + // todo: make this not an integral type typedef uint16_t ggml_fp16_t; + GGML_API float ggml_fp16_to_fp32(ggml_fp16_t); + GGML_API ggml_fp16_t ggml_fp32_to_fp16(float); + GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t); - // convert FP16 <-> FP32 - GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x); - GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x); - - GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n); - GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n); + // google brain half-precision bfloat16 + typedef struct { uint16_t bits; } ggml_bf16_t; + GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); + GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 + GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); + GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); struct ggml_object; struct ggml_context; @@ -368,6 +375,8 @@ extern "C" { GGML_TYPE_I32 = 26, GGML_TYPE_I64 = 27, GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, GGML_TYPE_COUNT, }; @@ -407,6 +416,8 @@ extern "C" { GGML_FTYPE_MOSTLY_IQ3_S = 20, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors + GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors }; // available tensor operations: @@ -472,6 +483,7 @@ extern "C" { GGML_OP_LEAKY_RELU, GGML_OP_FLASH_ATTN, + GGML_OP_FLASH_ATTN_EXT, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, GGML_OP_SSM_CONV, @@ -708,6 +720,9 @@ extern "C" { GGML_API void ggml_print_backtrace(void); + // accepts a UTF-8 path, even on Windows + GGML_API FILE * ggml_fopen(const char * fname, const char * mode); + GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node @@ -744,6 +759,7 @@ extern "C" { GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_contiguous(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor); + GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor); GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); @@ -755,6 +771,8 @@ extern "C" { // use this to compute the memory overhead of a tensor GGML_API size_t ggml_tensor_overhead(void); + GGML_API bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbytes); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); @@ -1154,14 +1172,11 @@ extern "C" { enum ggml_prec prec); // indirect matrix multiplication - // ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b) GGML_API struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, - struct ggml_tensor * const as[], - int n_as, - struct ggml_tensor * ids, - int id, - struct ggml_tensor * b); + struct ggml_tensor * as, + struct ggml_tensor * b, + struct ggml_tensor * ids); // A: m columns, n rows, // B: p columns, n rows, @@ -1716,6 +1731,25 @@ extern "C" { struct ggml_tensor * v, bool masked); +#define GGML_KQ_MASK_PAD 32 + + // q: [n_embd, n_batch, n_head, 1] + // k: [n_embd, n_kv, n_head_kv, 1] + // v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !! + // mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !! + // res: [n_embd, n_head, n_batch, 1] !! permuted !! + GGML_API struct ggml_tensor * ggml_flash_attn_ext( + struct ggml_context * ctx, + struct ggml_tensor * q, + struct ggml_tensor * k, + struct ggml_tensor * v, + struct ggml_tensor * mask, + float scale); + + GGML_API void ggml_flash_attn_ext_set_prec( + struct ggml_tensor * a, + enum ggml_prec prec); + GGML_API struct ggml_tensor * ggml_flash_attn_back( struct ggml_context * ctx, struct ggml_tensor * q, @@ -2204,9 +2238,9 @@ extern "C" { enum ggml_type type, const float * src, void * dst, - int start, - int nrows, - int n_per_row, + int64_t start, + int64_t nrows, + int64_t n_per_row, const float * imatrix); // @@ -2283,6 +2317,9 @@ extern "C" { GGML_API char * gguf_get_tensor_name (const struct gguf_context * ctx, int i); GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i); + // removes key if it exists + GGML_API void gguf_remove_key(struct gguf_context * ctx, const char * key); + // overrides existing values or adds a new one GGML_API void gguf_set_val_u8 (struct gguf_context * ctx, const char * key, uint8_t val); GGML_API void gguf_set_val_i8 (struct gguf_context * ctx, const char * key, int8_t val); @@ -2350,7 +2387,7 @@ extern "C" { GGML_API int ggml_cpu_has_fp16_va (void); GGML_API int ggml_cpu_has_wasm_simd (void); GGML_API int ggml_cpu_has_blas (void); - GGML_API int ggml_cpu_has_cublas (void); + GGML_API int ggml_cpu_has_cuda (void); GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_vulkan (void); GGML_API int ggml_cpu_has_kompute (void); @@ -2371,8 +2408,8 @@ extern "C" { #else #define GGML_RESTRICT restrict #endif - typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); - typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); + typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); + typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 4a6f5e323..162cf5c6e 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -1,11 +1,14 @@ #!/usr/bin/env python +import logging import argparse import asyncio import os import sys from tempfile import gettempdir, NamedTemporaryFile +logger = logging.getLogger("ggml-vk-generate-shaders") + shader_f32 = """ #define FLOAT_TYPE float """ @@ -18,6 +21,12 @@ shader_int8_ext = """ """ # Type-specific defines +shader_f32_defines = """ +#define QUANT_K 1 +#define QUANT_R 1 + +#define A_TYPE float +""" shader_f16_defines = """ #define QUANT_K 1 #define QUANT_R 1 @@ -157,48 +166,55 @@ struct block_q6_K """ # Dequant functions -shader_f16_dequant_func = """ -#define DEQUANT_FUNC vec2 v = vec2(data_a[ib + 0], data_a[ib + 1]); +shader_float_dequant_func = """ +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + return vec2(data_a[a_offset + ib], data_a[a_offset + ib + 1]); +} """ shader_q4_0_dequant_func = """ -#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ -const uint vui = uint(data_a[ib].qs[iqs]); \ -vec2 v = vec2(vui & 0xF, vui >> 4); \ -v = (v - 8.0f)*d; +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a[a_offset + ib].d); + const uint vui = uint(data_a[a_offset + ib].qs[iqs]); + return (vec2(vui & 0xF, vui >> 4) - 8.0f) * d; +} """ shader_q4_1_dequant_func = """ -#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ -const float m = float(data_a[ib].m); \ -const uint vui = uint(data_a[ib].qs[iqs]); \ -vec2 v = vec2(vui & 0xF, vui >> 4); \ -v = v*d + m; +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a[a_offset + ib].d); + const float m = float(data_a[a_offset + ib].m); + const uint vui = uint(data_a[a_offset + ib].qs[iqs]); + return vec2(vui & 0xF, vui >> 4) * d + m; +} """ shader_q5_0_dequant_func = """ -#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ -const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0]; \ -const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \ -const uint vui = uint(data_a[ib].qs[iqs]); \ -vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \ -v = (v - 16.0f) * d; +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a[a_offset + ib].d); + const uint uint_qh = uint(data_a[a_offset + ib].qh[1]) << 16 | data_a[a_offset + ib].qh[0]; + const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const uint vui = uint(data_a[a_offset + ib].qs[iqs]); + return (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d; +} """ shader_q5_1_dequant_func = """ -#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ -const float m = float(data_a[ib].m); \ -const uint uint_qh = data_a[ib].qh; \ -const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); \ -const uint vui = uint(data_a[ib].qs[iqs]); \ -vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y); \ -v = v*d + m; +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a[a_offset + ib].d); + const float m = float(data_a[a_offset + ib].m); + const uint uint_qh = data_a[a_offset + ib].qh; + const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10); + const uint vui = uint(data_a[a_offset + ib].qs[iqs]); + return vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m; +} """ shader_q8_0_dequant_func = """ -#define DEQUANT_FUNC const float d = float(data_a[ib].d); \ -vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])); \ -v = v * d; +vec2 dequantize(uint ib, uint iqs, uint a_offset) { + const float d = float(data_a[a_offset + ib].d); + return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d; +} """ # MULMAT @@ -208,6 +224,15 @@ mulmat_head = """#version 450 #extension GL_EXT_control_flow_attributes : enable #extension GL_EXT_shader_16bit_storage : require +#ifdef MUL_MAT_ID +#extension GL_EXT_buffer_reference2 : require +#extension GL_EXT_nonuniform_qualifier : require +#extension GL_EXT_scalar_block_layout : require +#extension GL_EXT_shader_explicit_arithmetic_types_int8 : require + +#define EXPERT_COUNT 8 +#endif + #ifndef LOAD_VEC_A #define LOAD_VEC_A 1 #endif @@ -223,6 +248,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +#ifdef MUL_MAT_ID +layout (binding = 3) readonly buffer IDS {int data_ids[];}; +#endif + layout (push_constant) uniform parameter { uint M; @@ -241,6 +270,21 @@ layout (push_constant) uniform parameter uint batch_stride_a; uint batch_stride_b; uint batch_stride_d; + +#ifdef MUL_MAT_ID + uint expert_stride_a; + uint expert_stride_b0; + uint expert_stride_b1; + uint expert_stride_d; + + uint ids_stride; + + uint n_as; + uint nei0; + uint nei1; + uint nbi1; + uint ne11; +#endif } p; layout (constant_id = 1) const uint BM = 64; @@ -256,9 +300,20 @@ layout (constant_id = 9) const uint WARP = 32; shared FLOAT_TYPE buf_a[BM * (BK+1)]; shared FLOAT_TYPE buf_b[BN * (BK+1)]; +#ifdef MUL_MAT_ID +shared u8vec2 rowids[2048]; +#endif + void main() { - const uint i13 = gl_GlobalInvocationID.z / p.ne12; - const uint i12 = gl_GlobalInvocationID.z % p.ne12; +#ifdef MUL_MAT_ID + const uint batch_idx = gl_GlobalInvocationID.z / p.n_as; + const uint expert_idx = gl_GlobalInvocationID.z % p.n_as; +#else + const uint batch_idx = gl_GlobalInvocationID.z; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; const uint i03 = i13 / p.broadcast3; const uint i02 = i12 / p.broadcast2; @@ -290,11 +345,35 @@ void main() { const uint loadstride_a = gl_WorkGroupSize.x * LOAD_VEC_A / BK; const uint loadstride_b = gl_WorkGroupSize.x * LOAD_VEC_B / BK; +#ifdef MUL_MAT_ID + uint _ne1 = 0; + for (uint ii1 = 0; ii1 < p.nei1; ii1++) { + for (uint ii0 = 0; ii0 < p.nei0; ii0++) { + if (data_ids[ii1*p.nbi1 + ii0] == expert_idx) { + rowids[_ne1] = u8vec2(ii0, ii1); + _ne1++; + } + } + } + + const u8vec2 id = rowids[ir * BN + ic]; +#endif + const uint start_k = ik * p.k_split; const uint end_k = min(p.K, (ik + 1) * p.k_split); - uint pos_a = (batch_idx_a * p.batch_stride_a + ir * BM * p.stride_a + start_k) / LOAD_VEC_A; - uint pos_b = (gl_GlobalInvocationID.z * p.batch_stride_b + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; + uint pos_a = ( +#ifdef MUL_MAT_ID + expert_idx * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a + ir * BM * p.stride_a + start_k) / LOAD_VEC_A; + uint pos_b = ( +#ifdef MUL_MAT_ID + id.y * p.expert_stride_b1 + + (id.x % p.ne11) * p.expert_stride_b0 + +#endif + batch_idx * p.batch_stride_b + + ic * BN * p.stride_b + start_k) / LOAD_VEC_B; float sums[WMITER * TM * WNITER * TN]; FLOAT_TYPE cache_a[WMITER * TM]; @@ -410,6 +489,133 @@ mulmat_load_q8_0 = """ buf_a[buf_idx ] = FLOAT_TYPE(v.x); buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);""" + +mulmat_load_q2_K = """ + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint qsi = (iqs / 64) * 32 + (iqs % 16) * 2; // 0,2,4..30 + const uint scalesi = iqs / 8; // 0..15 + const uint qsshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + + const uvec2 qs = uvec2(data_a[ib].qs[qsi], data_a[ib].qs[qsi + 1]); + const uint scales = data_a[ib].scales[scalesi]; + const vec2 d = vec2(data_a[ib].d); + + const vec2 v = d.x * float(scales & 0xF) * vec2((qs >> qsshift) & 3) - d.y * float(scales >> 4); + + buf_a[buf_idx ] = FLOAT_TYPE(v.x); + buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);""" + +mulmat_load_q3_K = """ + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + const uint hmi = (iqs % 16) * 2; // 0,2,4..30 + const uint j = (iqs % 64) / 4; // 0..3 + const uint is = iqs / 8; // 0..15 + const uint halfsplit = ((iqs % 64) / 16); // 0,1,2,3 + const uint qsshift = halfsplit * 2; // 0,2,4,6 + const uint m = 1 << (4 * n + halfsplit); // 1,2,4,8,16,32,64,128 + + const int8_t us = int8_t(is < 4 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+8] >> 0) & 3) << 4) : + is < 8 ? (data_a[ib].scales[is-0] & 0xF) | (((data_a[ib].scales[is+4] >> 2) & 3) << 4) : + is < 12 ? (data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is+0] >> 4) & 3) << 4) : + (data_a[ib].scales[is-8] >> 4) | (((data_a[ib].scales[is-4] >> 6) & 3) << 4)); + const float dl = float(data_a[ib].d) * float(us - 32); + + buf_a[buf_idx ] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi ] >> qsshift) & 3) - (((data_a[ib].hmask[hmi ] & m) != 0) ? 0 : 4))); + buf_a[buf_idx + 1] = FLOAT_TYPE(dl * float(int8_t((data_a[ib].qs[qsi + 1] >> qsshift) & 3) - (((data_a[ib].hmask[hmi + 1] & m) != 0) ? 0 : 4)));""" + +mulmat_load_q4_K = """ + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + + const vec2 loadd = vec2(data_a[ib].d); + + uint8_t sc; + uint8_t mbyte; + if (is < 4) { + sc = uint8_t(data_a[ib].scales[is ] & 63); + mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); + } else { + sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); + mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); + } + const float d = loadd.x * sc; + const float m = loadd.y * mbyte; + + buf_a[buf_idx ] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) - m); + buf_a[buf_idx + 1] = FLOAT_TYPE(d * float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) - m);""" + +mulmat_load_q5_K = """ + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 32; // 0,1,2,3 + const uint b = (iqs % 32) / 16; // 0,1 + const uint is = 2 * n + b; // 0..7 + const uint qsi = n * 32 + (iqs % 16) * 2; // 0,2,4..126 + const uint qhi = (iqs % 16) * 2; // 0,2,4..30 + + const uint8_t hm = uint8_t(1 << (iqs / 16)); + + const vec2 loadd = vec2(data_a[ib].d); + + uint8_t sc; + uint8_t mbyte; + if (is < 4) { + sc = uint8_t(data_a[ib].scales[is ] & 63); + mbyte = uint8_t(data_a[ib].scales[is + 4] & 63); + } else { + sc = uint8_t((data_a[ib].scales[is + 4] & 0xF) | ((data_a[ib].scales[is - 4] >> 6) << 4)); + mbyte = uint8_t((data_a[ib].scales[is + 4] >> 4) | ((data_a[ib].scales[is ] >> 6) << 4)); + } + const float d = loadd.x * sc; + const float m = loadd.y * mbyte; + + buf_a[buf_idx ] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi ] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi ] & hm) != 0 ? 16 : 0)) - m); + buf_a[buf_idx + 1] = FLOAT_TYPE(d * (float((data_a[ib].qs[qsi + 1] >> (b * 4)) & 0xF) + float((data_a[ib].qh[qhi + 1] & hm) != 0 ? 16 : 0)) - m);""" + +mulmat_load_q6_K = """ + const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a; + const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a * LOAD_VEC_A; + + const uint ib = idx / 128; // 2 values per idx + const uint iqs = idx % 128; // 0..127 + + const uint n = iqs / 64; // 0,1 + const uint b = (iqs % 64) / 32; // 0,1 + const uint is_b = (iqs % 16) / 8; // 0,1 + const uint qhshift = ((iqs % 64) / 16) * 2; // 0,2,4,6 + const uint is = 8 * n + qhshift + is_b; // 0..15 + const uint qsi = n * 64 + (iqs % 32) * 2; // 0,2,4..126 + const uint qhi = n * 32 + (iqs % 16) * 2; // 0,2,4..62 + + const float dscale = float(data_a[ib].d) * float(data_a[ib].scales[is]); + + buf_a[buf_idx ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi ] >> qhshift) & 3) << 4)) - 32)); + buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));""" + mulmat_body2 = """ } [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) { @@ -475,7 +681,11 @@ mulmat_body2 = """ const uint dr = ir * BM + warp_r * WM; const uint dc = ic * BN + warp_c * WN; - const uint offsets = gl_GlobalInvocationID.z * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; + const uint offsets = +#ifdef MUL_MAT_ID + expert_idx * p.expert_stride_d + +#endif + batch_idx * p.batch_stride_d + ik * p.batch_stride_d * gl_NumWorkGroups.z; [[unroll]] for (uint wsic = 0; wsic < WNITER; wsic++) { [[unroll]] for (uint wsir = 0; wsir < WMITER; wsir++) { @@ -941,21 +1151,54 @@ mul_mat_vec_head = """#version 450 #extension GL_EXT_shader_16bit_storage : require #extension GL_EXT_shader_8bit_storage : require +#ifdef MUL_MAT_ID +#define EXPERT_COUNT 8 +#endif +""" + + +mul_mat_vec_layout = """ +layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +#ifdef MUL_MAT_ID +layout (binding = 3) readonly buffer IDS {int data_ids[];}; +#endif + layout (push_constant) uniform parameter { uint ncols; - uint b_offset; - uint d_offset; + uint stride_a; + uint stride_b; + uint stride_d; + + uint ne02; + uint ne12; + uint broadcast2; + uint broadcast3; + + uint batch_stride_a; + uint batch_stride_b; + uint batch_stride_d; + +#ifdef MUL_MAT_ID + uint expert_stride_a; + uint expert_stride_b0; + uint expert_stride_b1; + uint expert_stride_d0; + uint expert_stride_d1; + + uint ne11; + uint nei0; + uint nbi1; + uint n_as; +#endif } p; """ mul_mat_vec_body = """ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - layout (constant_id = 0) const uint BLOCK_SIZE = 32; shared FLOAT_TYPE tmp[BLOCK_SIZE]; @@ -963,6 +1206,41 @@ shared FLOAT_TYPE tmp[BLOCK_SIZE]; void main() { const uint row = gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; @@ -974,11 +1252,11 @@ void main() { const uint iqs = (col%QUANT_K)/QUANT_R; // quant index const uint iybs = col - col%QUANT_K; // y block start index - DEQUANT_FUNC + vec2 v = dequantize(ib, iqs, a_offset / QUANT_K); // matrix multiplication - tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[p.b_offset + iybs + iqs + 0]) + - FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[p.b_offset + iybs + iqs + y_offset]); + tmp[tid] += FLOAT_TYPE(v.x) * FLOAT_TYPE(data_b[b_offset + iybs + iqs]) + + FLOAT_TYPE(v.y) * FLOAT_TYPE(data_b[b_offset + iybs + iqs + y_offset]); } // sum up partial sums and write back result @@ -990,7 +1268,7 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ @@ -999,17 +1277,48 @@ void main() { mul_mat_vec_q2_K_body = """ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = row*num_blocks_per_row; + const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -1035,22 +1344,22 @@ void main() { FLOAT_TYPE sum1 = FLOAT_TYPE(0.0); FLOAT_TYPE sum2 = FLOAT_TYPE(0.0); for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum1 += FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3); - sum2 += FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF); + sum1 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 0) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 0) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 2) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 2) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 4) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 4) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l + 0] >> 6) & 3) + + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7] & 0xF) * FLOAT_TYPE((data_a[ib0 + i].qs[q_offset + l +16] >> 6) & 3); + sum2 += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 0] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 1] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 2] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 3] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 4] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 5] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 6] >> 4) & 0xF) + + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE((data_a[ib0 + i].scales[s_offset + 7] >> 4) & 0xF); } tmp[16 * ix + tid] += dall * sum1 - dmin * sum2; } @@ -1064,24 +1373,55 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ mul_mat_vec_q3_K_body = """ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = row*num_blocks_per_row; + const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -1108,14 +1448,14 @@ void main() { FLOAT_TYPE sum = FLOAT_TYPE(0.0); for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) { - sum += FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)); + sum += FLOAT_TYPE(data_b[b_offset + y_idx + l + 0]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[0] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 32]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[2] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 64]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[4] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 8] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 96]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[6] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[10] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 16]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[1] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 48]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[3] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 0) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l + 80]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[5] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[ 9] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)) + + FLOAT_TYPE(data_b[b_offset + y_idx + l +112]) * FLOAT_TYPE(int8_t(((data_a[ib0 + i].scales[7] >> s_shift) & 0xF) | ((data_a[ib0 + i].scales[11] >> (s_shift + 2) & 0x3) << 4)) - 32) * FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)); } tmp[16 * ix + tid] += d * sum; } @@ -1129,24 +1469,55 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ mul_mat_vec_q4_K_body = """ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = row*num_blocks_per_row; + const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -1200,15 +1571,15 @@ void main() { const uint8_t q4_14 = uint8_t(data_a[ib0 + i].qs[q_offset + 66] >> 4); const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 67] >> 4); - const FLOAT_TYPE sx = FLOAT_TYPE(data_b[p.b_offset + y1_idx] * q4_0 + data_b[p.b_offset + y1_idx + 1] * q4_1 + data_b[p.b_offset + y1_idx + 2] * q4_2 + data_b[p.b_offset + y1_idx + 3] * q4_3); - const FLOAT_TYPE sy = FLOAT_TYPE(data_b[p.b_offset + y1_idx + 32] * q4_4 + data_b[p.b_offset + y1_idx + 33] * q4_5 + data_b[p.b_offset + y1_idx + 34] * q4_6 + data_b[p.b_offset + y1_idx + 35] * q4_7); - const FLOAT_TYPE sz = FLOAT_TYPE(data_b[p.b_offset + y2_idx] * q4_8 + data_b[p.b_offset + y2_idx + 1] * q4_9 + data_b[p.b_offset + y2_idx + 2] * q4_10 + data_b[p.b_offset + y2_idx + 3] * q4_11); - const FLOAT_TYPE sw = FLOAT_TYPE(data_b[p.b_offset + y2_idx + 32] * q4_12 + data_b[p.b_offset + y2_idx + 33] * q4_13 + data_b[p.b_offset + y2_idx + 34] * q4_14 + data_b[p.b_offset + y2_idx + 35] * q4_15); + const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1 + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * q4_3); + const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_5 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * q4_7); + const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx]) * q4_8 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_9 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * q4_10 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * q4_11); + const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_12 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_13 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * q4_14 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * q4_15); const FLOAT_TYPE smin = FLOAT_TYPE( - data_b[p.b_offset + y1_idx ] * sc2 + data_b[p.b_offset + y1_idx + 32] * sc3 + data_b[p.b_offset + y2_idx ] * sc6 + data_b[p.b_offset + y2_idx + 32] * sc7 - + data_b[p.b_offset + y1_idx + 1] * sc2 + data_b[p.b_offset + y1_idx + 33] * sc3 + data_b[p.b_offset + y2_idx + 1] * sc6 + data_b[p.b_offset + y2_idx + 33] * sc7 - + data_b[p.b_offset + y1_idx + 2] * sc2 + data_b[p.b_offset + y1_idx + 34] * sc3 + data_b[p.b_offset + y2_idx + 2] * sc6 + data_b[p.b_offset + y2_idx + 34] * sc7 - + data_b[p.b_offset + y1_idx + 3] * sc2 + data_b[p.b_offset + y1_idx + 35] * sc3 + data_b[p.b_offset + y2_idx + 3] * sc6 + data_b[p.b_offset + y2_idx + 35] * sc7 + FLOAT_TYPE(data_b[b_offset + y1_idx ]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7 + + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7 + + FLOAT_TYPE(data_b[b_offset + y1_idx + 2]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 34]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 2]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 34]) * sc7 + + FLOAT_TYPE(data_b[b_offset + y1_idx + 3]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 35]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 3]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 35]) * sc7 ); tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin); #else @@ -1221,13 +1592,13 @@ void main() { const uint8_t q4_6 = uint8_t(data_a[ib0 + i].qs[q_offset + 64] >> 4); const uint8_t q4_7 = uint8_t(data_a[ib0 + i].qs[q_offset + 65] >> 4); - const FLOAT_TYPE sx = FLOAT_TYPE(data_b[p.b_offset + y1_idx ] * q4_0 + data_b[p.b_offset + y1_idx + 1] * q4_1); - const FLOAT_TYPE sy = FLOAT_TYPE(data_b[p.b_offset + y1_idx + 32] * q4_2 + data_b[p.b_offset + y1_idx + 33] * q4_3); - const FLOAT_TYPE sz = FLOAT_TYPE(data_b[p.b_offset + y2_idx ] * q4_4 + data_b[p.b_offset + y2_idx + 1] * q4_5); - const FLOAT_TYPE sw = FLOAT_TYPE(data_b[p.b_offset + y2_idx + 32] * q4_6 + data_b[p.b_offset + y2_idx + 33] * q4_7); + const FLOAT_TYPE sx = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx ]) * q4_0 + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * q4_1); + const FLOAT_TYPE sy = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * q4_2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * q4_3); + const FLOAT_TYPE sz = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx ]) * q4_4 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * q4_5); + const FLOAT_TYPE sw = FLOAT_TYPE(FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * q4_6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * q4_7); const FLOAT_TYPE smin = FLOAT_TYPE( - data_b[p.b_offset + y1_idx] * sc2 + data_b[p.b_offset + y1_idx + 32] * sc3 + data_b[p.b_offset + y2_idx] * sc6 + data_b[p.b_offset + y2_idx + 32] * sc7 - + data_b[p.b_offset + y1_idx + 1] * sc2 + data_b[p.b_offset + y1_idx + 33] * sc3 + data_b[p.b_offset + y2_idx + 1] * sc6 + data_b[p.b_offset + y2_idx + 33] * sc7 + FLOAT_TYPE(data_b[b_offset + y1_idx]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * sc7 + + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * sc2 + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * sc3 + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * sc6 + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * sc7 ); tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * FLOAT_TYPE(data_a[ib0 + i].scales[v_im] & 0x3f) + sy * FLOAT_TYPE(data_a[ib0 + i].scales[v_im + 1] & 0x3f) + sz * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 4] & 0x0f) | ((data_a[ib0 + i].scales[v_im] & 0xc0) >> 2)) + sw * FLOAT_TYPE((data_a[ib0 + i].scales[v_im + 5] & 0x0f) | ((data_a[ib0 + i].scales[v_im + 1] & 0xc0) >> 2))) - dmin * smin); @@ -1243,24 +1614,55 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ mul_mat_vec_q5_K_body = """ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - shared FLOAT_TYPE tmp[32]; void main() { const uint row = gl_WorkGroupID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = row*num_blocks_per_row; + const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint tid = gl_LocalInvocationID.x/2; // 0...31 or 0...16 const uint ix = gl_LocalInvocationID.x%2; // 0 or 0, 1 @@ -1314,32 +1716,32 @@ void main() { const uint8_t q4_15 = uint8_t(data_a[ib0 + i].qs[q_offset + 81] >> 4); const FLOAT_TYPE sx = FLOAT_TYPE( - data_b[p.b_offset + y1_idx ] * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 1] * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 16] * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 17] * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)) + FLOAT_TYPE(data_b[b_offset + y1_idx ]) * (q4_0 + (((data_a[ib0 + i].qh[l0 ] & hm1) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) * (q4_1 + (((data_a[ib0 + i].qh[l0 + 1] & hm1) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) * (q4_2 + (((data_a[ib0 + i].qh[l0 + 16] & hm1) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 17]) * (q4_3 + (((data_a[ib0 + i].qh[l0 + 17] & hm1) != 0) ? 16 : 0)) ); const FLOAT_TYPE sy = FLOAT_TYPE( - data_b[p.b_offset + y1_idx + 32] * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 33] * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 48] * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y1_idx + 49] * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)) + FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) * (q4_4 + (((data_a[ib0 + i].qh[l0 ] & (hm1 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) * (q4_5 + (((data_a[ib0 + i].qh[l0 + 1] & (hm1 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) * (q4_6 + (((data_a[ib0 + i].qh[l0 + 16] & (hm1 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y1_idx + 49]) * (q4_7 + (((data_a[ib0 + i].qh[l0 + 17] & (hm1 << 1)) != 0) ? 16 : 0)) ); const FLOAT_TYPE sz = FLOAT_TYPE( - data_b[p.b_offset + y2_idx ] * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 1] * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 16] * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 17] * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)) + FLOAT_TYPE(data_b[b_offset + y2_idx ]) * (q4_8 + (((data_a[ib0 + i].qh[l0 ] & hm2) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) * (q4_9 + (((data_a[ib0 + i].qh[l0 + 1] & hm2) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) * (q4_10 + (((data_a[ib0 + i].qh[l0 + 16] & hm2) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 17]) * (q4_11 + (((data_a[ib0 + i].qh[l0 + 17] & hm2) != 0) ? 16 : 0)) ); const FLOAT_TYPE sw = FLOAT_TYPE( - data_b[p.b_offset + y2_idx + 32] * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 33] * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 48] * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)) - + data_b[p.b_offset + y2_idx + 49] * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)) + FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) * (q4_12 + (((data_a[ib0 + i].qh[l0 ] & (hm2 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) * (q4_13 + (((data_a[ib0 + i].qh[l0 + 1] & (hm2 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) * (q4_14 + (((data_a[ib0 + i].qh[l0 + 16] & (hm2 << 1)) != 0) ? 16 : 0)) + + FLOAT_TYPE(data_b[b_offset + y2_idx + 49]) * (q4_15 + (((data_a[ib0 + i].qh[l0 + 17] & (hm2 << 1)) != 0) ? 16 : 0)) ); const FLOAT_TYPE smin = FLOAT_TYPE( - (data_b[p.b_offset + y1_idx] + data_b[p.b_offset + y1_idx + 1] + data_b[p.b_offset + y1_idx + 16] + data_b[p.b_offset + y1_idx + 17]) * sc2 + (data_b[p.b_offset + y1_idx + 32] + data_b[p.b_offset + y1_idx + 33] + data_b[p.b_offset + y1_idx + 48] + data_b[p.b_offset + y1_idx + 49]) * sc3 - + (data_b[p.b_offset + y2_idx] + data_b[p.b_offset + y2_idx + 1] + data_b[p.b_offset + y2_idx + 16] + data_b[p.b_offset + y2_idx + 17]) * sc6 + (data_b[p.b_offset + y2_idx + 32] + data_b[p.b_offset + y2_idx + 33] + data_b[p.b_offset + y2_idx + 48] + data_b[p.b_offset + y2_idx + 49]) * sc7 + (FLOAT_TYPE(data_b[b_offset + y1_idx]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 17])) * sc2 + (FLOAT_TYPE(data_b[b_offset + y1_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y1_idx + 49])) * sc3 + + (FLOAT_TYPE(data_b[b_offset + y2_idx]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 1]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 16]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 17])) * sc6 + (FLOAT_TYPE(data_b[b_offset + y2_idx + 32]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 33]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 48]) + FLOAT_TYPE(data_b[b_offset + y2_idx + 49])) * sc7 ); tmp[16 * ix + tid] += FLOAT_TYPE(dall * (sx * sc0 + sy * sc1 + sz * sc4 + sw * sc5) - dmin * smin); } @@ -1353,25 +1755,55 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ mul_mat_vec_q6_K_body = """ layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - shared FLOAT_TYPE tmp[32]; void main() { - const uint block_size = gl_WorkGroupSize.x; const uint row = gl_WorkGroupID.x; + const uint batch_idx = gl_GlobalInvocationID.y; +#ifdef MUL_MAT_ID + const uint expert_idx1 = gl_GlobalInvocationID.z / p.nei0; + const uint expert_idx0 = gl_GlobalInvocationID.z % p.nei0; +#endif + + const uint i13 = batch_idx / p.ne12; + const uint i12 = batch_idx % p.ne12; + + const uint i03 = i13 / p.broadcast3; + const uint i02 = i12 / p.broadcast2; + + const uint batch_idx_a = i03 * p.ne02 + i02; + +#ifdef MUL_MAT_ID + const uint expert_id = data_ids[expert_idx1 * p.nbi1 + expert_idx0]; +#endif + + const uint a_offset = +#ifdef MUL_MAT_ID + expert_id * p.expert_stride_a + +#endif + batch_idx_a * p.batch_stride_a; + const uint b_offset = +#ifdef MUL_MAT_ID + (expert_idx0 % p.ne11) * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_b; + const uint d_offset = +#ifdef MUL_MAT_ID + expert_idx0 * p.expert_stride_b0 + + expert_idx1 * p.expert_stride_b1 + +#endif + batch_idx * p.batch_stride_d; const uint num_blocks_per_row = p.ncols / QUANT_K; - const uint ib0 = row*num_blocks_per_row; + const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row; const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16 const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1 @@ -1402,22 +1834,22 @@ void main() { const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d); #if K_QUANTS_PER_ITERATION == 1 - FLOAT_TYPE sum = FLOAT_TYPE(data_b[p.b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32); + FLOAT_TYPE sum = FLOAT_TYPE(data_b[b_offset + y_idx + 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x03) << 4)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 16]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 1]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x03) << 4)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x0c) << 2)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 48]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 3]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] & 0xF) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x0c) << 2)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 0] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0x30) >> 0)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 80]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 5]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 16] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0x30) >> 0)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + 96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 32] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 0] & 0xc0) >> 2)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx +112]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 7]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + 48] >> 4) | ((data_a[ib0 + i].qh[qh_offset + 16] & 0xc0) >> 2)) - 32); tmp[16 * ix + tid] += sum; #else FLOAT_TYPE sum = FLOAT_TYPE(0.0); [[unroll]] for (int l = 0; l < 4; ++l) { - sum += FLOAT_TYPE(data_b[p.b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32) - + FLOAT_TYPE(data_b[p.b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32); + sum += FLOAT_TYPE(data_b[b_offset + y_idx + l+ 0]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 0]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 0) & 3) << 4)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + l+32]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 2]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] & 0xF) | (((data_a[ib0 + i].qh[qh_offset + l] >> 2) & 3) << 4)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + l+64]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 4]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+ 0] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 4) & 3) << 4)) - 32) + + FLOAT_TYPE(data_b[b_offset + y_idx + l+96]) * FLOAT_TYPE(data_a[ib0 + i].scales[s_offset + 6]) * d * FLOAT_TYPE(int8_t((data_a[ib0 + i].ql[ql_offset + l+32] >> 4) | (((data_a[ib0 + i].qh[qh_offset + l] >> 6) & 3) << 4)) - 32); } tmp[16 * ix + tid] += sum; #endif @@ -1432,7 +1864,7 @@ void main() { barrier(); } if (tid == 0) { - dst[p.d_offset + row] = D_TYPE(tmp[0]); + data_d[d_offset + row] = D_TYPE(tmp[0]); } } """ @@ -1611,13 +2043,15 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint d_offset; float param1; float param2; -} p; +} p;""" +generic_unary_op_layout = """ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; +layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};""" +generic_unary_op_funcs = """ uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; @@ -1636,14 +2070,17 @@ uint dst_idx(uint idx) { const uint i11 = (idx - i13_offset - i12_offset) / p.ne10; const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10; return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10; -} +}""" +generic_unary_op_main = """ void main() { if (gl_GlobalInvocationID.x >= p.ne) { return; } """ +generic_unary_op_combined = f"{generic_unary_op_head}\n{generic_unary_op_layout}\n{generic_unary_op_funcs}\n{generic_unary_op_main}" + generic_binary_op_head = """#version 450 #extension GL_EXT_shader_16bit_storage : require @@ -1655,15 +2092,17 @@ layout (push_constant) uniform parameter uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13; uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23; uint d_offset; - uint param1; uint param2; -} p; + float param1; float param2; +} p;""" +generic_binary_op_layout = """ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer B {A_TYPE data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE data_d[];}; +layout (binding = 1) readonly buffer B {B_TYPE data_b[];}; +layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};""" +generic_binary_op_funcs = """ uint src0_idx(uint idx) { const uint i03 = idx / (p.ne02*p.ne01*p.ne00); const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00; @@ -1693,14 +2132,17 @@ uint dst_idx(uint idx) { const uint i21 = (idx - i23_offset - i22_offset) / p.ne20; const uint i20 = idx - i23_offset - i22_offset - i21*p.ne20; return i23*p.nb23 + i22*p.nb22 + i21*p.nb21 + i20*p.nb20; -} +}""" +generic_binary_op_main = """ void main() { if (gl_GlobalInvocationID.x >= p.ne) { return; } """ +generic_binary_op_combined = f"{generic_binary_op_head}\n{generic_binary_op_layout}\n{generic_binary_op_funcs}\n{generic_binary_op_main}" + # MUL F32 mul_body = """ data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)])); @@ -1715,7 +2157,7 @@ add_body = """ # SCALE scale_body = """ - data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(p.param1)); + data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1)); } """ @@ -1745,39 +2187,55 @@ cpy_f16_f16_end = """ """ # GET_ROWS -get_rows_body = """ -#extension GL_EXT_control_flow_attributes : enable -#extension GL_EXT_shader_8bit_storage : require - -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) readonly buffer Y {int data_b[];}; -layout (binding = 2) writeonly buffer D {D_TYPE dst[];}; - +get_rows_float_body = """ void main() { - const uint col = int(gl_GlobalInvocationID.x) * 2; - const uint row = int(gl_GlobalInvocationID.y); + const uint i00 = gl_GlobalInvocationID.x; + const uint i10 = gl_GlobalInvocationID.y; + const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; + const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; - if (col >= p.KY) { + if (i00 >= p.ne00) { return; } - const uint r = uint(data_b[row]); + const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; - // copy data_a[r*p.KY + col] to dst[row*p.KX + col] - const uint xi = r*p.KY + col; - const uint di = row*p.KY + col; + const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; + const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; - const uint ib = xi/QUANT_K; // block index - const uint iqs = (xi%QUANT_K)/QUANT_R; // quant index - const uint iybs = di - di%QUANT_K; // y block start index +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[d_offset + i00] = D_TYPE(data_a[a_offset + i00]); +#else + data_d[d_offset + i00] = data_a[a_offset + i00]; +#endif +} +""" + +get_rows_body = """ +void main() { + const uint i00 = (gl_GlobalInvocationID.x)*2; + const uint i10 = gl_GlobalInvocationID.y; + const uint i11 = (gl_GlobalInvocationID.z)/p.ne12; + const uint i12 = (gl_GlobalInvocationID.z)%p.ne12; + + if (i00 >= p.ne00) { + return; + } + + const uint i01 = data_b[i10*p.nb10 + i11*p.nb11 + i12*p.nb12]; + + const uint a_offset = i01*p.nb01 + i11*p.nb02 + i12*p.nb03; + const uint d_offset = i10*p.nb21 + i11*p.nb22 + i12*p.nb23; + + const uint ib = a_offset + i00/QUANT_K; // block index + const uint iqs = (i00%QUANT_K)/QUANT_R; // quant index + const uint iybs = i00 - i00%QUANT_K; // dst block start index const uint y_offset = QUANT_R == 1 ? 1 : QUANT_K/2; - DEQUANT_FUNC + vec2 v = dequantize(ib, iqs, 0); - dst[iybs + iqs + 0] = D_TYPE(v.x); - dst[iybs + iqs + y_offset] = D_TYPE(v.y); + data_d[d_offset + iybs + iqs ] = D_TYPE(v.x); + data_d[d_offset + iybs + iqs + y_offset] = D_TYPE(v.y); } """ @@ -1873,7 +2331,11 @@ void main() { } const uint i = row*p.ncols + col; - data_d[i] = D_TYPE(data_a[i] - float(uint(col > p.n_past + row % p.rows_per_channel) * 0xFFFFFFFF)); + if (col > p.n_past + row % p.rows_per_channel) { + data_d[i] = D_TYPE(uintBitsToFloat(0xFF800000)); + } else { + data_d[i] = D_TYPE(data_a[i]); + } } """ @@ -1893,8 +2355,6 @@ void main() { const uint row = gl_WorkGroupID.x; const uint tid = gl_LocalInvocationID.x; - const float eps = 1e-5f; - sum[tid] = vec2(0.0f, 0.0f); [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { @@ -1914,7 +2374,7 @@ void main() { const float mean = sum[0].x / p.KX; const float var = sum[0].y / p.KX - mean * mean; - const float inv_std = inversesqrt(var + 1e-5f); + const float inv_std = inversesqrt(var + p.param1); [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { data_d[row*p.KX + col] = D_TYPE((float(data_a[row*p.KX + col]) - mean) * inv_std); @@ -2015,7 +2475,7 @@ void main() { vals[tid] = uintBitsToFloat(0xFF800000); [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - vals[tid] = max(vals[tid], FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) + (p.KZ > 0 ? slope * data_c[col] : 0.0f)); + vals[tid] = max(vals[tid], FLOAT_TYPE(data_a[rowx * p.KX + col]) * p.scale + (p.KY > 0 ? FLOAT_TYPE(data_b[rowy * p.KX + col]) : FLOAT_TYPE(0.0f)) + (p.KZ > 0 ? slope * FLOAT_TYPE(data_c[col]) : 0.0f)); } barrier(); @@ -2341,7 +2801,7 @@ async def string_to_spv(name, code, defines, fp16=True): stdout, stderr = await proc.communicate() - print(" ".join(cmd)) + logger.info(" ".join(cmd)) if proc.returncode: raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}") @@ -2350,7 +2810,7 @@ async def string_to_spv(name, code, defines, fp16=True): cmd.extend([f"-D{key}={value}" for key, value in defines.items()]) code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())]) - print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}") + logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}") f.close() os.remove(f.name) sys.exit(proc.returncode) @@ -2363,7 +2823,7 @@ async def string_to_spv(name, code, defines, fp16=True): async def main(): - print("ggml_vulkan: Generating and compiling shaders to SPIR-V") + logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V") tasks = [] @@ -2385,13 +2845,13 @@ async def main(): stream.clear() stream.extend((mulmat_head, shader_float_type, mulmat_body1, mulmat_load_scalar, mulmat_body2)) tasks.append(string_to_spv("matmul_f32", "".join(stream), {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_f16", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_f16_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_f16_f32", "".join(stream), {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) - tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_f16_f32_aligned", "".join(stream), {"LOAD_VEC_A": 1, "LOAD_VEC_B": load_vec, "A_TYPE": "float16_t", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) stream.clear() stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_0_defines, mulmat_body1, mulmat_load_q4_0, mulmat_body2)) @@ -2418,6 +2878,93 @@ async def main(): tasks.append(string_to_spv("matmul_q8_0_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) tasks.append(string_to_spv("matmul_q8_0_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + stream.clear() + stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2)) + tasks.append(string_to_spv("matmul_q2_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_q2_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + stream.clear() + stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2)) + tasks.append(string_to_spv("matmul_q3_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_q3_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + stream.clear() + stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2)) + tasks.append(string_to_spv("matmul_q4_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_q4_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + stream.clear() + stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2)) + tasks.append(string_to_spv("matmul_q5_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_q5_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + stream.clear() + stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2)) + tasks.append(string_to_spv("matmul_q6_k_f32", "".join(stream), {"LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + tasks.append(string_to_spv("matmul_q6_k_f32_aligned", "".join(stream), {"LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # MUL_MAT_ID + # stream.clear() + # stream.extend((mulmat_head, shader_float_type, mulmat_body1, mulmat_load_scalar, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_f32", "".join(stream), {"MUL_MAT_ID": "1", "A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # tasks.append(string_to_spv("matmul_id_f16", "".join(stream), {"MUL_MAT_ID": "1", "A_TYPE": "float16_t", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_f16_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type_f16, "D_TYPE": "float"}, fp16)) + + # tasks.append(string_to_spv("matmul_id_f16_f32", "".join(stream), {"MUL_MAT_ID": "1", "A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_f16_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "A_TYPE": vec_type_f16, "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_0_defines, mulmat_body1, mulmat_load_q4_0, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q4_0_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q4_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q4_0_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_1_defines, mulmat_body1, mulmat_load_q4_1, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q4_1_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q4_1", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q4_1_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_1", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_0_defines, mulmat_body1, mulmat_load_q5_0, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q5_0_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q5_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q5_0_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_1_defines, mulmat_body1, mulmat_load_q5_1, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q5_1_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q5_1", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q5_1_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_1", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q8_0_defines, mulmat_body1, mulmat_load_q8_0, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q8_0_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q8_0", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q8_0_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q8_0", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q2_K_defines, mulmat_body1, mulmat_load_q2_K, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q2_k_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q2_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q2_k_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q2_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q3_K_defines, mulmat_body1, mulmat_load_q3_K, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q3_k_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q3_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q3_k_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q3_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q4_K_defines, mulmat_body1, mulmat_load_q4_K, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q4_k_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q4_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q4_k_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q4_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q5_K_defines, mulmat_body1, mulmat_load_q5_K, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q5_k_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q5_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q5_k_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q5_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + + # stream.clear() + # stream.extend((mulmat_head, shader_int8_ext, shader_float_type, shader_q6_K_defines, mulmat_body1, mulmat_load_q6_K, mulmat_body2)) + # tasks.append(string_to_spv("matmul_id_q6_k_f32", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "A_TYPE": "block_q6_K", "B_TYPE": "float", "D_TYPE": "float"}, fp16)) + # tasks.append(string_to_spv("matmul_id_q6_k_f32_aligned", "".join(stream), {"MUL_MAT_ID": "1", "LOAD_VEC_A": 2, "LOAD_VEC_B": load_vec, "A_TYPE": "block_q6_K", "B_TYPE": vec_type, "D_TYPE": "float"}, fp16)) + # Shaders where precision is needed, so no fp16 version # mul mat vec @@ -2426,31 +2973,34 @@ async def main(): stream.extend((mul_mat_vec_head, shader_int8_ext, shader_f32)) if i == GGML_TYPE_F16: - stream.extend((shader_f16_defines, shader_f16_dequant_func, mul_mat_vec_body)) + stream.extend((shader_f16_defines, mul_mat_vec_layout, shader_float_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q4_0: - stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, mul_mat_vec_body)) + stream.extend((shader_q4_0_defines, mul_mat_vec_layout, shader_q4_0_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q4_1: - stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, mul_mat_vec_body)) + stream.extend((shader_q4_1_defines, mul_mat_vec_layout, shader_q4_1_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q5_0: - stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, mul_mat_vec_body)) + stream.extend((shader_q5_0_defines, mul_mat_vec_layout, shader_q5_0_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q5_1: - stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, mul_mat_vec_body)) + stream.extend((shader_q5_1_defines, mul_mat_vec_layout, shader_q5_1_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q8_0: - stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, mul_mat_vec_body)) + stream.extend((shader_q8_0_defines, mul_mat_vec_layout, shader_q8_0_dequant_func, mul_mat_vec_body)) elif i == GGML_TYPE_Q2_K: - stream.extend((shader_q2_K_defines, mul_mat_vec_q2_K_body)) + stream.extend((shader_q2_K_defines, mul_mat_vec_layout, mul_mat_vec_q2_K_body)) elif i == GGML_TYPE_Q3_K: - stream.extend((shader_q3_K_defines, mul_mat_vec_q3_K_body)) + stream.extend((shader_q3_K_defines, mul_mat_vec_layout, mul_mat_vec_q3_K_body)) elif i == GGML_TYPE_Q4_K: - stream.extend((shader_q4_K_defines, mul_mat_vec_q4_K_body)) + stream.extend((shader_q4_K_defines, mul_mat_vec_layout, mul_mat_vec_q4_K_body)) elif i == GGML_TYPE_Q5_K: - stream.extend((shader_q5_K_defines, mul_mat_vec_q5_K_body)) + stream.extend((shader_q5_K_defines, mul_mat_vec_layout, mul_mat_vec_q5_K_body)) elif i == GGML_TYPE_Q6_K: - stream.extend((shader_q6_K_defines, mul_mat_vec_q6_K_body)) + stream.extend((shader_q6_K_defines, mul_mat_vec_layout, mul_mat_vec_q6_K_body)) else: continue - tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION})) + tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f32_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION})) + tasks.append(string_to_spv(f"mul_mat_vec_{type_names[i]}_f16_f32", "".join(stream), {"B_TYPE": "float16_t", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION})) + + # tasks.append(string_to_spv(f"mul_mat_vec_id_{type_names[i]}_f32", "".join(stream), {"MUL_MAT_ID": "1", "B_TYPE": "float", "D_TYPE": "float", "K_QUANTS_PER_ITERATION": K_QUANTS_PER_ITERATION})) # Dequant shaders for i in range(0, VK_NUM_TYPES): @@ -2488,25 +3038,32 @@ async def main(): # get_rows for i in range(0, VK_NUM_TYPES): stream.clear() - stream.extend((generic_head, shader_int8_ext, shader_f32)) + stream.extend((generic_binary_op_head, shader_int8_ext, shader_f32)) + optimization_workaround = False - if i == GGML_TYPE_F16: - stream.extend((shader_f16_defines, shader_f16_dequant_func, get_rows_body)) + if i == GGML_TYPE_F32: + stream.extend((shader_f32_defines, generic_binary_op_layout, generic_binary_op_funcs, get_rows_float_body)) + elif i == GGML_TYPE_F16: + stream.extend((shader_f16_defines, generic_binary_op_layout, generic_binary_op_funcs, get_rows_float_body)) + optimization_workaround = True elif i == GGML_TYPE_Q4_0: - stream.extend((shader_q4_0_defines, shader_q4_0_dequant_func, get_rows_body)) + stream.extend((shader_q4_0_defines, generic_binary_op_layout, shader_q4_0_dequant_func, generic_binary_op_funcs, get_rows_body)) elif i == GGML_TYPE_Q4_1: - stream.extend((shader_q4_1_defines, shader_q4_1_dequant_func, get_rows_body)) + stream.extend((shader_q4_1_defines, generic_binary_op_layout, shader_q4_1_dequant_func, generic_binary_op_funcs, get_rows_body)) elif i == GGML_TYPE_Q5_0: - stream.extend((shader_q5_0_defines, shader_q5_0_dequant_func, get_rows_body)) + stream.extend((shader_q5_0_defines, generic_binary_op_layout, shader_q5_0_dequant_func, generic_binary_op_funcs, get_rows_body)) elif i == GGML_TYPE_Q5_1: - stream.extend((shader_q5_1_defines, shader_q5_1_dequant_func, get_rows_body)) + stream.extend((shader_q5_1_defines, generic_binary_op_layout, shader_q5_1_dequant_func, generic_binary_op_funcs, get_rows_body)) elif i == GGML_TYPE_Q8_0: - stream.extend((shader_q8_0_defines, shader_q8_0_dequant_func, get_rows_body)) + stream.extend((shader_q8_0_defines, generic_binary_op_layout, shader_q8_0_dequant_func, generic_binary_op_funcs, get_rows_body)) else: continue - tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "float", "D_TYPE": "float"})) + if optimization_workaround: + tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"})) + else: + tasks.append(string_to_spv(f"get_rows_{type_names[i]}", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float16_t"})) + tasks.append(string_to_spv(f"get_rows_{type_names[i]}_f32", "".join(stream), {"B_TYPE": "int", "D_TYPE": "float"})) tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", mul_mat_p021_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", mul_mat_nc_src, {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"})) @@ -2515,20 +3072,20 @@ async def main(): tasks.append(string_to_spv("norm_f32", f"{generic_head}\n{shader_f32}\n{norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("rms_norm_f32", f"{generic_head}\n{shader_f32}\n{rms_norm_body}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"})) - tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_head}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_head}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) + tasks.append(string_to_spv("cpy_f32_f32", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("cpy_f32_f16", f"{generic_unary_op_combined}\n{cpy_end}", {"A_TYPE": "float", "D_TYPE": "float16_t"})) + tasks.append(string_to_spv("cpy_f16_f16", f"{generic_unary_op_combined}\n{cpy_f16_f16_end}", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) - tasks.append(string_to_spv("add_f32", f"{generic_binary_op_head}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) + tasks.append(string_to_spv("add_f32", f"{generic_binary_op_combined}\n{add_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("split_k_reduce", mulmat_split_k_reduce_src, {})) - tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_head}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) + tasks.append(string_to_spv("mul_f32", f"{generic_binary_op_combined}\n{mul_body}", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_head}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) + tasks.append(string_to_spv("scale_f32", f"{generic_unary_op_combined}\n{scale_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_head}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) + tasks.append(string_to_spv("sqr_f32", f"{generic_unary_op_combined}\n{sqr_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) - tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_head}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) + tasks.append(string_to_spv("clamp_f32", f"{generic_unary_op_combined}\n{clamp_body}", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"})) tasks.append(string_to_spv("gelu_f32", f"{generic_head}\n{shader_f32}\n{gelu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("silu_f32", f"{generic_head}\n{shader_f32}\n{silu_body}", {"A_TYPE": "float", "D_TYPE": "float"})) @@ -2537,6 +3094,7 @@ async def main(): tasks.append(string_to_spv("diag_mask_inf_f32", f"{diag_mask_inf_head}\n{shader_f32}\n{diag_mask_inf_body}", {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("soft_max_f32", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float", "C_TYPE": "float", "D_TYPE": "float"})) + tasks.append(string_to_spv("soft_max_f32_f16", f"{soft_max_head}\n{shader_f32}\n{soft_max_body}", {"A_TYPE": "float", "B_TYPE": "float16_t", "C_TYPE": "float16_t", "D_TYPE": "float"})) tasks.append(string_to_spv("rope_f32", rope_src, {"A_TYPE": "float", "D_TYPE": "float"})) tasks.append(string_to_spv("rope_f16", rope_src, {"A_TYPE": "float16_t", "D_TYPE": "float16_t"})) @@ -2579,9 +3137,12 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator") parser.add_argument("--glslc", help="Path to glslc") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if args.glslc: GLSLC = args.glslc diff --git a/gguf-py/README.md b/gguf-py/README.md index 22d7ffa52..a04c22759 100644 --- a/gguf-py/README.md +++ b/gguf-py/README.md @@ -21,6 +21,8 @@ pip install gguf [scripts/gguf-convert-endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-convert-endian.py) — Allows converting the endianness of GGUF files. +[scripts/gguf-new-metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/scripts/gguf-new-metadata.py) — Copies a GGUF file with added/modified/removed metadata values. + ## Development Maintainers who participate in development of this package are advised to install it in editable mode: diff --git a/gguf-py/examples/reader.py b/gguf-py/examples/reader.py index 62e0769da..d841048c6 100644 --- a/gguf-py/examples/reader.py +++ b/gguf-py/examples/reader.py @@ -1,8 +1,10 @@ #!/usr/bin/env python3 +import logging import sys from pathlib import Path from gguf.gguf_reader import GGUFReader +logger = logging.getLogger("reader") sys.path.insert(0, str(Path(__file__).parent.parent)) @@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path): reader = GGUFReader(gguf_file_path) # List all key-value pairs in a columnized format - print("Key-Value Pairs:") + print("Key-Value Pairs:") # noqa: NP100 max_key_length = max(len(key) for key in reader.fields.keys()) for key, field in reader.fields.items(): value = field.parts[field.data[0]] - print(f"{key:{max_key_length}} : {value}") - print("----") + print(f"{key:{max_key_length}} : {value}") # noqa: NP100 + print("----") # noqa: NP100 # List all tensors - print("Tensors:") + print("Tensors:") # noqa: NP100 tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}" - print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) - print("-" * 80) + print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100 + print("-" * 80) # noqa: NP100 for tensor in reader.tensors: shape_str = "x".join(map(str, tensor.shape)) size_str = str(tensor.n_elements) quantization_str = tensor.tensor_type.name - print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) + print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100 if __name__ == '__main__': if len(sys.argv) < 2: - print("Usage: reader.py ") + logger.info("Usage: reader.py ") sys.exit(1) gguf_file_path = sys.argv[1] read_gguf_file(gguf_file_path) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 4a4facb06..5951c0bb0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -1,6 +1,5 @@ from __future__ import annotations -import sys from enum import Enum, IntEnum, auto from typing import Any @@ -24,6 +23,7 @@ class Keys: ALIGNMENT = "general.alignment" NAME = "general.name" AUTHOR = "general.author" + VERSION = "general.version" URL = "general.url" DESCRIPTION = "general.description" LICENSE = "general.license" @@ -71,6 +71,7 @@ class Keys: class Tokenizer: MODEL = "tokenizer.ggml.model" + PRE = "tokenizer.ggml.pre" LIST = "tokenizer.ggml.tokens" TOKEN_TYPE = "tokenizer.ggml.token_type" TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count" # for BERT-style token types @@ -89,6 +90,13 @@ class Keys: HF_JSON = "tokenizer.huggingface.json" RWKV = "tokenizer.rwkv.world" CHAT_TEMPLATE = "tokenizer.chat_template" + CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}" + CHAT_TEMPLATES = "tokenizer.chat_templates" + # FIM/Infill special tokens constants + PREFIX_ID = "tokenizer.ggml.prefix_token_id" + SUFFIX_ID = "tokenizer.ggml.suffix_token_id" + MIDDLE_ID = "tokenizer.ggml.middle_token_id" + EOT_ID = "tokenizer.ggml.eot_token_id" # @@ -100,6 +108,7 @@ class MODEL_ARCH(IntEnum): LLAMA = auto() FALCON = auto() BAICHUAN = auto() + GROK = auto() GPT2 = auto() GPTJ = auto() GPTNEOX = auto() @@ -113,7 +122,9 @@ class MODEL_ARCH(IntEnum): STABLELM = auto() QWEN = auto() QWEN2 = auto() + QWEN2MOE = auto() PHI2 = auto() + PHI3 = auto() PLAMO = auto() CODESHELL = auto() ORION = auto() @@ -122,51 +133,59 @@ class MODEL_ARCH(IntEnum): GEMMA = auto() STARCODER2 = auto() MAMBA = auto() + XVERSE = auto() COMMAND_R = auto() + DBRX = auto() + OLMO = auto() class MODEL_TENSOR(IntEnum): - TOKEN_EMBD = auto() - TOKEN_EMBD_NORM = auto() - TOKEN_TYPES = auto() - POS_EMBD = auto() - OUTPUT = auto() - OUTPUT_NORM = auto() - ROPE_FREQS = auto() - ATTN_Q = auto() - ATTN_K = auto() - ATTN_V = auto() - ATTN_QKV = auto() - ATTN_OUT = auto() - ATTN_NORM = auto() - ATTN_NORM_2 = auto() - ATTN_OUT_NORM = auto() - ATTN_ROT_EMBD = auto() - FFN_GATE_INP = auto() - FFN_NORM = auto() - FFN_GATE = auto() - FFN_DOWN = auto() - FFN_UP = auto() - FFN_ACT = auto() - FFN_GATE_EXP = auto() - FFN_DOWN_EXP = auto() - FFN_UP_EXP = auto() - ATTN_Q_NORM = auto() - ATTN_K_NORM = auto() - LAYER_OUT_NORM = auto() - SSM_IN = auto() - SSM_CONV1D = auto() - SSM_X = auto() - SSM_DT = auto() - SSM_A = auto() - SSM_D = auto() - SSM_OUT = auto() + TOKEN_EMBD = auto() + TOKEN_EMBD_NORM = auto() + TOKEN_TYPES = auto() + POS_EMBD = auto() + OUTPUT = auto() + OUTPUT_NORM = auto() + ROPE_FREQS = auto() + ATTN_Q = auto() + ATTN_K = auto() + ATTN_V = auto() + ATTN_QKV = auto() + ATTN_OUT = auto() + ATTN_NORM = auto() + ATTN_NORM_2 = auto() + ATTN_OUT_NORM = auto() + ATTN_ROT_EMBD = auto() + FFN_GATE_INP = auto() + FFN_GATE_INP_SHEXP = auto() + FFN_NORM = auto() + FFN_GATE = auto() + FFN_DOWN = auto() + FFN_UP = auto() + FFN_ACT = auto() + FFN_GATE_EXP = auto() + FFN_DOWN_EXP = auto() + FFN_UP_EXP = auto() + FFN_GATE_SHEXP = auto() + FFN_DOWN_SHEXP = auto() + FFN_UP_SHEXP = auto() + ATTN_Q_NORM = auto() + ATTN_K_NORM = auto() + LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.FALCON: "falcon", MODEL_ARCH.BAICHUAN: "baichuan", + MODEL_ARCH.GROK: "grok", MODEL_ARCH.GPT2: "gpt2", MODEL_ARCH.GPTJ: "gptj", MODEL_ARCH.GPTNEOX: "gptneox", @@ -180,7 +199,9 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.STABLELM: "stablelm", MODEL_ARCH.QWEN: "qwen", MODEL_ARCH.QWEN2: "qwen2", + MODEL_ARCH.QWEN2MOE: "qwen2moe", MODEL_ARCH.PHI2: "phi2", + MODEL_ARCH.PHI3: "phi3", MODEL_ARCH.PLAMO: "plamo", MODEL_ARCH.CODESHELL: "codeshell", MODEL_ARCH.ORION: "orion", @@ -189,45 +210,52 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", MODEL_ARCH.MAMBA: "mamba", + MODEL_ARCH.XVERSE: "xverse", MODEL_ARCH.COMMAND_R: "command-r", + MODEL_ARCH.DBRX: "dbrx", + MODEL_ARCH.OLMO: "olmo", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { - MODEL_TENSOR.TOKEN_EMBD: "token_embd", - MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", - MODEL_TENSOR.TOKEN_TYPES: "token_types", - MODEL_TENSOR.POS_EMBD: "position_embd", - MODEL_TENSOR.OUTPUT_NORM: "output_norm", - MODEL_TENSOR.OUTPUT: "output", - MODEL_TENSOR.ROPE_FREQS: "rope_freqs", - MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", - MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", - MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", - MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", - MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", - MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", - MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", - MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", - MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", - MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", - MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", - MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", - MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", - MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", - MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", - MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", - MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", - MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate.{xid}", - MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", - MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", - MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", - MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", - MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", - MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", - MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", - MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", - MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", + MODEL_TENSOR.TOKEN_EMBD: "token_embd", + MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", + MODEL_TENSOR.TOKEN_TYPES: "token_types", + MODEL_TENSOR.POS_EMBD: "position_embd", + MODEL_TENSOR.OUTPUT_NORM: "output_norm", + MODEL_TENSOR.OUTPUT: "output", + MODEL_TENSOR.ROPE_FREQS: "rope_freqs", + MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm", + MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2", + MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv", + MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q", + MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k", + MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v", + MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output", + MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd", + MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm", + MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm", + MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm", + MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp", + MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp", + MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm", + MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate", + MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down", + MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up", + MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp", + MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp", + MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp", + MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn", + MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps", + MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps", + MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps", + MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -251,6 +279,28 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN_EXP, MODEL_TENSOR.FFN_UP_EXP, ], + MODEL_ARCH.GROK: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.GPTNEOX: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -341,6 +391,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_ACT, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.POS_EMBD, ], MODEL_ARCH.GPTJ: [ MODEL_TENSOR.TOKEN_EMBD, @@ -408,6 +461,8 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, ], MODEL_ARCH.QWEN: [ MODEL_TENSOR.TOKEN_EMBD, @@ -437,6 +492,25 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.QWEN2MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_INP_SHEXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + ], MODEL_ARCH.PLAMO: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -478,6 +552,20 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.PHI3: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.CODESHELL: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.POS_EMBD, @@ -582,6 +670,22 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.SSM_D, MODEL_TENSOR.SSM_OUT, ], + MODEL_ARCH.XVERSE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.COMMAND_R: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, @@ -593,6 +697,32 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q_NORM, + ], + MODEL_ARCH.DBRX: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + ], + MODEL_ARCH.OLMO: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, ], # TODO } @@ -626,6 +756,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.XVERSE: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } # @@ -682,6 +816,8 @@ class GGMLQuantizationType(IntEnum): I32 = 26 I64 = 27 F64 = 28 + IQ1_M = 29 + BF16 = 30 class GGUFEndian(IntEnum): @@ -718,14 +854,13 @@ class GGUFValueType(IntEnum): return GGUFValueType.INT32 # TODO: need help with 64-bit types in Python else: - print("Unknown type:", type(val)) - sys.exit() + raise ValueError(f"Unknown type: {type(val)}") # Note: Does not support GGML_QKK_64 QK_K = 256 # Items here are (block size, type size) -GGML_QUANT_SIZES = { +GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { GGMLQuantizationType.F32: (1, 4), GGMLQuantizationType.F16: (1, 2), GGMLQuantizationType.Q4_0: (32, 2 + 16), @@ -753,6 +888,8 @@ GGML_QUANT_SIZES = { GGMLQuantizationType.I32: (1, 4), GGMLQuantizationType.I64: (1, 8), GGMLQuantizationType.F64: (1, 8), + GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32), + GGMLQuantizationType.BF16: (1, 2), } @@ -804,6 +941,7 @@ KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL +KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES @@ -817,3 +955,7 @@ KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV +KEY_TOKENIZER_PRIFIX_ID = Keys.Tokenizer.PREFIX_ID +KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID +KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID +KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID diff --git a/gguf-py/gguf/gguf_reader.py b/gguf-py/gguf/gguf_reader.py index 33afac552..21b089f8a 100644 --- a/gguf-py/gguf/gguf_reader.py +++ b/gguf-py/gguf/gguf_reader.py @@ -4,6 +4,7 @@ # from __future__ import annotations +import logging import os from collections import OrderedDict from typing import Any, Literal, NamedTuple, TypeVar, Union @@ -27,6 +28,7 @@ from gguf.constants import ( GGUFValueType, ) +logger = logging.getLogger(__name__) READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION] @@ -63,7 +65,7 @@ class ReaderTensor(NamedTuple): class GGUFReader: # I - same as host, S - swapped - byte_order: Literal['I' | 'S'] = 'I' + byte_order: Literal['I'] | Literal['S'] = 'I' alignment: int = GGUF_DEFAULT_ALIGNMENT # Note: Internal helper, API may change. @@ -81,7 +83,7 @@ class GGUFReader: GGUFValueType.BOOL: np.bool_, } - def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'): + def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'): self.data = np.memmap(path, mode = mode) offs = 0 if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC: @@ -126,7 +128,7 @@ class GGUFReader: return self.tensors[idx] def _get( - self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None, + self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None, ) -> npt.NDArray[Any]: count = int(count) itemsize = int(np.empty([], dtype = dtype).itemsize) @@ -139,8 +141,13 @@ class GGUFReader: def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int: if field.name in self.fields: - raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') - self.fields[field.name] = field + # TODO: add option to generate error on duplicate keys + # raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}') + + logger.warning(f'Duplicate key {field.name} at offset {field.offset}') + self.fields[field.name + '_{}'.format(field.offset)] = field + else: + self.fields[field.name] = field return 0 if skip_sum else sum(int(part.nbytes) for part in field.parts) def _get_str(self, offset: int) -> tuple[npt.NDArray[np.uint64], npt.NDArray[np.uint8]]: @@ -234,10 +241,16 @@ class GGUFReader: def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None: tensors = [] + tensor_names = set() # keep track of name to prevent duplicated tensors for field in fields: _name_len, name_data, _n_dims, dims, raw_dtype, offset_tensor = field.parts + # check if there's any tensor having same name already in the list + tensor_name = str(bytes(name_data), encoding = 'utf-8') + if tensor_name in tensor_names: + raise ValueError(f'Found duplicated tensor with name {tensor_name}') + tensor_names.add(tensor_name) ggml_type = GGMLQuantizationType(raw_dtype[0]) - n_elems = np.prod(dims) + n_elems = int(np.prod(dims)) block_size, type_size = GGML_QUANT_SIZES[ggml_type] n_bytes = n_elems * type_size // block_size data_offs = int(start_offs + offset_tensor[0]) @@ -267,7 +280,7 @@ class GGUFReader: item_count = n_bytes item_type = np.uint8 tensors.append(ReaderTensor( - name = str(bytes(name_data), encoding = 'utf-8'), + name = tensor_name, tensor_type = ggml_type, shape = dims, n_elements = n_elems, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 2ae6c814b..8dcf9330b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1,12 +1,14 @@ from __future__ import annotations +import logging import os import shutil import struct import tempfile from enum import Enum, auto from io import BufferedWriter -from typing import IO, Any, Sequence +from typing import IO, Any, Callable, Sequence, Mapping +from string import ascii_letters, digits import numpy as np @@ -23,6 +25,49 @@ from .constants import ( TokenType, ) +logger = logging.getLogger(__name__) + + +class LazyTensor: + data: Callable[[], np.ndarray[Any, Any]] + # to avoid too deep recursion + functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]] + dtype: np.dtype[Any] + shape: tuple[int, ...] + + def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]): + self.data = data + self.functions = [] + self.dtype = np.dtype(dtype) + self.shape = shape + + def astype(self, dtype: type, **kwargs) -> LazyTensor: + self.functions.append(lambda n: n.astype(dtype, **kwargs)) + self.dtype = np.dtype(dtype) + return self + + @property + def nbytes(self) -> int: + size = 1 + for n in self.shape: + size *= n + return size * self.dtype.itemsize + + def tofile(self, *args, **kwargs) -> None: + data = self.data() + for f in self.functions: + data = f(data) + assert data.shape == self.shape + assert data.dtype == self.dtype + assert data.nbytes == self.nbytes + self.functions = [] + self.data = lambda: data + data.tofile(*args, **kwargs) + + def byteswap(self, *args, **kwargs) -> LazyTensor: + self.functions.append(lambda n: n.byteswap(*args, **kwargs)) + return self + class WriterState(Enum): EMPTY = auto() @@ -34,7 +79,7 @@ class WriterState(Enum): class GGUFWriter: fout: BufferedWriter temp_file: tempfile.SpooledTemporaryFile[bytes] | None - tensors: list[np.ndarray[Any, Any]] + tensors: list[np.ndarray[Any, Any] | LazyTensor] _simple_value_packing = { GGUFValueType.UINT8: "B", GGUFValueType.INT8: "b", @@ -62,10 +107,11 @@ class GGUFWriter: self.kv_data_count = 0 self.ti_data = bytearray() self.ti_data_count = 0 + self.ti_names = set() self.use_temp_file = use_temp_file self.temp_file = None self.tensors = [] - print("gguf: This GGUF file is for {0} Endian only".format( + logger.info("gguf: This GGUF file is for {0} Endian only".format( "Big" if self.endianess == GGUFEndian.BIG else "Little", )) self.state = WriterState.EMPTY @@ -171,7 +217,7 @@ class GGUFWriter: if pack_fmt is not None: self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL) elif vtype == GGUFValueType.STRING: - encoded_val = val.encode("utf8") if isinstance(val, str) else val + encoded_val = val.encode("utf-8") if isinstance(val, str) else val self.kv_data += self._pack("Q", len(encoded_val)) self.kv_data += encoded_val elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val: @@ -196,7 +242,11 @@ class GGUFWriter: if self.state is not WriterState.EMPTY: raise ValueError(f'Expected output file to be empty, got {self.state}') - encoded_name = name.encode("utf8") + if name in self.ti_names: + raise ValueError(f'Duplicated tensor name {name}') + self.ti_names.add(name) + + encoded_name = name.encode("utf-8") self.ti_data += self._pack("Q", len(encoded_name)) self.ti_data += encoded_name n_dims = len(tensor_shape) @@ -228,7 +278,7 @@ class GGUFWriter: self.ti_data_count += 1 def add_tensor( - self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, + self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None, ) -> None: if self.endianess == GGUFEndian.BIG: @@ -253,7 +303,7 @@ class GGUFWriter: if pad != 0: fp.write(bytes([0] * pad)) - def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None: + def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None: if self.state is not WriterState.TI_DATA: raise ValueError(f'Expected output file to contain tensor info, got {self.state}') @@ -263,15 +313,33 @@ class GGUFWriter: tensor.tofile(self.fout) self.write_padding(self.fout, tensor.nbytes) - def write_tensors_to_file(self) -> None: + def write_tensors_to_file(self, *, progress: bool = False) -> None: self.write_ti_data_to_file() self.write_padding(self.fout, self.fout.tell()) if self.temp_file is None: + self.tensors.reverse() # to pop from the "beginning" in constant time + + if progress: + from tqdm import tqdm + + total_bytes = sum(t.nbytes for t in self.tensors) + + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) + + while True: + try: + tensor = self.tensors.pop() + except IndexError: + break + tensor.tofile(self.fout) + bar.update(tensor.nbytes) + self.write_padding(self.fout, tensor.nbytes) + return while True: try: - tensor = self.tensors.pop(0) + tensor = self.tensors.pop() except IndexError: break tensor.tofile(self.fout) @@ -296,6 +364,9 @@ class GGUFWriter: def add_author(self, author: str) -> None: self.add_string(Keys.General.AUTHOR, author) + def add_version(self, version: str) -> None: + self.add_string(Keys.General.VERSION, version) + def add_tensor_data_layout(self, layout: str) -> None: self.add_string(Keys.LLM.TENSOR_DATA_LAYOUT.format(arch=self.arch), layout) @@ -305,6 +376,9 @@ class GGUFWriter: def add_description(self, description: str) -> None: self.add_string(Keys.General.DESCRIPTION, description) + def add_licence(self, licence: str) -> None: + self.add_string(Keys.General.LICENSE, licence) + def add_source_url(self, url: str) -> None: self.add_string(Keys.General.SOURCE_URL, url) @@ -415,6 +489,9 @@ class GGUFWriter: def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) + def add_tokenizer_pre(self, pre: str) -> None: + self.add_string(Keys.Tokenizer.PRE, pre) + def add_token_list(self, tokens: Sequence[str] | Sequence[bytes] | Sequence[bytearray]) -> None: self.add_array(Keys.Tokenizer.LIST, tokens) @@ -460,9 +537,47 @@ class GGUFWriter: def add_add_space_prefix(self, value: bool) -> None: self.add_bool(Keys.Tokenizer.ADD_PREFIX, value) - def add_chat_template(self, value: str) -> None: + def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None: + if not isinstance(value, str): + template_default = None + template_names = set() + + for choice in value: + name = choice.get('name', '') + template = choice.get('template') + + # Allowing non-alphanumerical characters in template name is probably not a good idea, so filter it + name = ''.join((c if c in ascii_letters + digits else '_' for c in name)) + + if name and template is not None: + if name == 'default': + template_default = template + else: + template_names.add(name) + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template) + + if template_names: + self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(template_names)) + + if template_default is None: + return + + value = template_default + self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value) + def add_prefix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.PREFIX_ID, id) + + def add_suffix_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.SUFFIX_ID, id) + + def add_middle_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.MIDDLE_ID, id) + + def add_eot_token_id(self, id: int) -> None: + self.add_uint32(Keys.Tokenizer.EOT_ID, id) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index ed89955d8..e5750d419 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -10,7 +10,7 @@ class TensorNameMap: # Token embeddings MODEL_TENSOR.TOKEN_EMBD: ( "gpt_neox.embed_in", # gptneox - "transformer.wte", # gpt2 gpt-j mpt refact qwen + "transformer.wte", # gpt2 gpt-j mpt refact qwen dbrx "transformer.word_embeddings", # falcon "word_embeddings", # bloom "model.embed_tokens", # llama-hf @@ -23,6 +23,7 @@ class TensorNameMap: "model.embedding", # mamba-qbert "backbone.embedding", # mamba "backbone.embeddings", # mamba-hf + "transformer.in_out_embed", # Grok ), # Token type embeddings @@ -47,7 +48,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba dbrx "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -59,13 +60,14 @@ class TensorNameMap: "transformer.ln_f", # gpt2 gpt-j falcon "model.norm", # llama-hf baichuan internlm2 "norm", # llama-pth - "transformer.norm_f", # mpt + "transformer.norm_f", # mpt dbrx "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon "model.final_layernorm", # persimmon "lm_head.ln", # phi2 "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba + "transformer.rms_norm", # Grok ), # Rope frequencies @@ -93,6 +95,8 @@ class TensorNameMap: "model.layers.{bid}.attention_norm", # internlm2 "model.layers.{bid}.norm", # mamba-qbert "backbone.layers.{bid}.norm", # mamba + "transformer.decoder_layer.{bid}.rms_norm", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_1", # dbrx ), # Attention norm 2 @@ -105,6 +109,7 @@ class TensorNameMap: "gpt_neox.layers.{bid}.attention.query_key_value", # gptneox "transformer.h.{bid}.attn.c_attn", # gpt2 qwen "transformer.blocks.{bid}.attn.Wqkv", # mpt + "transformer.blocks.{bid}.norm_attn_norm.attn.Wqkv", # dbrx "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon @@ -112,62 +117,70 @@ class TensorNameMap: "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 "encoder.layers.{bid}.attn.Wqkv", # nomic-bert + "model.layers.{bid}.self_attn.qkv_proj" # phi3 ), # Attention query MODEL_TENSOR.ATTN_Q: ( - "model.layers.{bid}.self_attn.q_proj", # llama-hf - "layers.{bid}.attention.wq", # llama-pth - "encoder.layer.{bid}.attention.self.query", # bert - "transformer.h.{bid}.attn.q_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.q_proj", # plamo - "model.layers.{bid}.attention.wq" # internlm2 + "model.layers.{bid}.self_attn.q_proj", # llama-hf + "layers.{bid}.attention.wq", # llama-pth + "encoder.layer.{bid}.attention.self.query", # bert + "transformer.h.{bid}.attn.q_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.q_proj", # plamo + "model.layers.{bid}.attention.wq", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.query" # Grok ), # Attention key MODEL_TENSOR.ATTN_K: ( - "model.layers.{bid}.self_attn.k_proj", # llama-hf - "layers.{bid}.attention.wk", # llama-pth - "encoder.layer.{bid}.attention.self.key", # bert - "transformer.h.{bid}.attn.k_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.k_proj", # plamo - "model.layers.{bid}.attention.wk" # internlm2 + "model.layers.{bid}.self_attn.k_proj", # llama-hf + "layers.{bid}.attention.wk", # llama-pth + "encoder.layer.{bid}.attention.self.key", # bert + "transformer.h.{bid}.attn.k_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.k_proj", # plamo + "model.layers.{bid}.attention.wk", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.key" # Grok ), # Attention value MODEL_TENSOR.ATTN_V: ( - "model.layers.{bid}.self_attn.v_proj", # llama-hf - "layers.{bid}.attention.wv", # llama-pth - "encoder.layer.{bid}.attention.self.value", # bert - "transformer.h.{bid}.attn.v_proj", # gpt-j - "model.layers.layers.{bid}.self_attn.v_proj", # plamo - "model.layers.{bid}.attention.wv" # internlm2 + "model.layers.{bid}.self_attn.v_proj", # llama-hf + "layers.{bid}.attention.wv", # llama-pth + "encoder.layer.{bid}.attention.self.value", # bert + "transformer.h.{bid}.attn.v_proj", # gpt-j + "model.layers.layers.{bid}.self_attn.v_proj", # plamo + "model.layers.{bid}.attention.wv", # internlm2 + "transformer.decoder_layer.{bid}.multi_head_attention.value" # Grok ), # Attention output MODEL_TENSOR.ATTN_OUT: ( - "gpt_neox.layers.{bid}.attention.dense", # gptneox - "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen - "transformer.blocks.{bid}.attn.out_proj", # mpt - "transformer.h.{bid}.self_attention.dense", # falcon - "h.{bid}.self_attention.dense", # bloom - "model.layers.{bid}.self_attn.o_proj", # llama-hf - "layers.{bid}.attention.wo", # llama-pth - "encoder.layer.{bid}.attention.output.dense", # bert - "transformer.h.{bid}.attn.out_proj", # gpt-j - "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon - "model.layers.{bid}.self_attn.dense", # persimmon - "h.{bid}.attn.c_proj", # gpt2 - "transformer.h.{bid}.mixer.out_proj", # phi2 - "model.layers.layers.{bid}.self_attn.o_proj", # plamo - "model.layers.{bid}.attention.wo", # internlm2 - "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "gpt_neox.layers.{bid}.attention.dense", # gptneox + "transformer.h.{bid}.attn.c_proj", # gpt2 refact qwen + "transformer.blocks.{bid}.attn.out_proj", # mpt + "transformer.h.{bid}.self_attention.dense", # falcon + "h.{bid}.self_attention.dense", # bloom + "model.layers.{bid}.self_attn.o_proj", # llama-hf + "layers.{bid}.attention.wo", # llama-pth + "encoder.layer.{bid}.attention.output.dense", # bert + "transformer.h.{bid}.attn.out_proj", # gpt-j + "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon + "h.{bid}.attn.c_proj", # gpt2 + "transformer.h.{bid}.mixer.out_proj", # phi2 + "model.layers.layers.{bid}.self_attn.o_proj", # plamo + "model.layers.{bid}.attention.wo", # internlm2 + "encoder.layers.{bid}.attn.out_proj", # nomic-bert + "transformer.decoder_layer.{bid}.multi_head_attention.linear", # Grok + "transformer.blocks.{bid}.norm_attn_norm.attn.out_proj", # dbrx ), # Attention output norm MODEL_TENSOR.ATTN_OUT_NORM: ( "encoder.layer.{bid}.attention.output.LayerNorm", # bert "encoder.layers.{bid}.norm1", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_1", # Grok + "transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx ), # Rotary embeddings @@ -190,11 +203,19 @@ class TensorNameMap: "model.layers.{bid}.ln2", # yi "h.{bid}.ln_2", # gpt2 "model.layers.{bid}.ffn_norm", # internlm2 + "transformer.decoder_layer.{bid}.rms_norm_2", # Grok ), MODEL_TENSOR.FFN_GATE_INP: ( - "layers.{bid}.feed_forward.gate", # mixtral - "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "layers.{bid}.feed_forward.gate", # mixtral + "model.layers.{bid}.block_sparse_moe.gate", # mixtral + "model.layers.{bid}.mlp.gate", # qwen2moe + "transformer.decoder_layer.{bid}.router", # Grok + "transformer.blocks.{bid}.ffn.router.layer", # dbrx + ), + + MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert_gate", # qwen2moe ), # Feed-forward up @@ -214,6 +235,7 @@ class TensorNameMap: "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 "model.layers.{bid}.mlp.fc1", # phi2 + "model.layers.{bid}.mlp.gate_up_proj", # phi3 "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert @@ -221,8 +243,14 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_UP_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w3", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w3", # mixtral + "layers.{bid}.feed_forward.experts.w3", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_v", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.v1", # dbrx + "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe (merged) + ), + + MODEL_TENSOR.FFN_UP_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert.up_proj", # qwen2moe ), # AWQ-activation gate @@ -241,8 +269,14 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_GATE_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w1", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w1", # mixtral + "layers.{bid}.feed_forward.experts.w1", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w1", # dbrx + "model.layers.{bid}.mlp.experts.gate_proj", # qwen2moe (merged) + ), + + MODEL_TENSOR.FFN_GATE_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert.gate_proj", # qwen2moe ), # Feed-forward down @@ -268,18 +302,28 @@ class TensorNameMap: ), MODEL_TENSOR.FFN_DOWN_EXP: ( - "layers.{bid}.feed_forward.experts.{xid}.w2", # mixtral - "model.layers.{bid}.block_sparse_moe.experts.{xid}.w2", # mixtral + "layers.{bid}.feed_forward.experts.w2", # mixtral (merged) + "transformer.decoder_layer.{bid}.moe.linear_1", # Grok (merged) + "transformer.blocks.{bid}.ffn.experts.mlp.w2", # dbrx + "model.layers.{bid}.mlp.experts.down_proj", # qwen2moe (merged) + ), + + MODEL_TENSOR.FFN_DOWN_SHEXP: ( + "model.layers.{bid}.mlp.shared_expert.down_proj", # qwen2moe ), MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", "model.layers.{bid}.self_attn.q_layernorm", # persimmon + "model.layers.{bid}.self_attn.q_norm", # cohere + "transformer.blocks.{bid}.attn.q_ln", # sea-lion ), MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", "model.layers.{bid}.self_attn.k_layernorm", # persimmon + "model.layers.{bid}.self_attn.k_norm", # cohere + "transformer.blocks.{bid}.attn.k_ln", # sea-lion ), MODEL_TENSOR.ROPE_FREQS: ( @@ -287,8 +331,9 @@ class TensorNameMap: ), MODEL_TENSOR.LAYER_OUT_NORM: ( - "encoder.layer.{bid}.output.LayerNorm", # bert - "encoder.layers.{bid}.norm2", # nomic-bert + "encoder.layer.{bid}.output.LayerNorm", # bert + "encoder.layers.{bid}.norm2", # nomic-bert + "transformer.decoder_layer.{bid}.rms_norm_3", # Grok ), MODEL_TENSOR.SSM_IN: ( @@ -343,7 +388,7 @@ class TensorNameMap: if tensor not in MODEL_TENSORS[arch]: continue # TODO: make this configurable - n_experts = 8 + n_experts = 60 for xid in range(n_experts): tensor_name = TENSOR_NAMES[tensor].format(bid = bid, xid = xid) self.mapping[tensor_name] = (tensor, tensor_name) diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py index a23136b18..3ba99be4f 100644 --- a/gguf-py/gguf/vocab.py +++ b/gguf-py/gguf/vocab.py @@ -1,23 +1,25 @@ from __future__ import annotations +import logging import json import os -import sys from pathlib import Path -from typing import Any, Callable +from typing import Any, Callable, Sequence, Mapping, Iterable from .gguf_writer import GGUFWriter +logger = logging.getLogger(__name__) + class SpecialVocab: merges: list[str] add_special_token: dict[str, bool] special_token_ids: dict[str, int] - chat_template: str | None + chat_template: str | Sequence[Mapping[str, str]] | None def __init__( self, path: str | os.PathLike[str], load_merges: bool = False, - special_token_types: tuple[str, ...] | None = None, + special_token_types: Iterable[str] | None = None, n_vocab: int | None = None, ): self.special_token_ids = {} @@ -40,38 +42,29 @@ class SpecialVocab: def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None: if self.merges: if not quiet: - print(f'gguf: Adding {len(self.merges)} merge(s).') + logger.info(f'Adding {len(self.merges)} merge(s).') gw.add_token_merges(self.merges) elif self.load_merges: - print( - 'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.', - file = sys.stderr, - ) + logger.warning('Adding merges requested but no merges found, output may be non-functional.') for typ, tokid in self.special_token_ids.items(): id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None) if id_handler is None: - print( - f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping') continue if not quiet: - print(f'gguf: Setting special token type {typ} to {tokid}') + logger.info(f'Setting special token type {typ} to {tokid}') id_handler(tokid) for typ, value in self.add_special_token.items(): add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None) if add_handler is None: - print( - f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping', - file = sys.stderr, - ) + logger.warning(f'No handler for add_{typ}_token with value {value} - skipping') continue if not quiet: - print(f'gguf: Setting add_{typ}_token to {value}') + logger.info(f'Setting add_{typ}_token to {value}') add_handler(value) if self.chat_template is not None: if not quiet: - print(f'gguf: Setting chat_template to {self.chat_template}') + logger.info(f'Setting chat_template to {self.chat_template}') gw.add_chat_template(self.chat_template) def _load(self, path: Path) -> None: @@ -99,10 +92,7 @@ class SpecialVocab: continue parts = line.split(None, 3) if len(parts) != 2: - print( - f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring', - file = sys.stderr, - ) + logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring') continue merges.append(f'{parts[0]} {parts[1]}') self.merges = merges @@ -118,10 +108,7 @@ class SpecialVocab: return self.special_token_ids[typ] = tid return - print( - f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping', - file = sys.stderr, - ) + logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping') def _try_load_from_tokenizer_json(self, path: Path) -> bool: tokenizer_file = path / 'tokenizer.json' @@ -141,13 +128,10 @@ class SpecialVocab: with open(tokenizer_config_file, encoding = 'utf-8') as f: tokenizer_config = json.load(f) chat_template = tokenizer_config.get('chat_template') - if chat_template is None or isinstance(chat_template, str): + if chat_template is None or isinstance(chat_template, (str, list)): self.chat_template = chat_template else: - print( - f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring', - file = sys.stderr - ) + logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring') for typ in self.special_token_types: add_entry = tokenizer_config.get(f'add_{typ}_token') if isinstance(add_entry, bool): diff --git a/gguf-py/pyproject.toml b/gguf-py/pyproject.toml index 96396e04e..36e63ee3b 100644 --- a/gguf-py/pyproject.toml +++ b/gguf-py/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "gguf" -version = "0.8.0" +version = "0.9.0" description = "Read and write ML models in GGUF for GGML" authors = ["GGML "] packages = [ @@ -21,6 +21,7 @@ classifiers = [ [tool.poetry.dependencies] python = ">=3.8" numpy = ">=1.17" +tqdm = ">=4.27" [tool.poetry.dev-dependencies] pytest = "^5.2" @@ -33,3 +34,4 @@ build-backend = "poetry.core.masonry.api" gguf-convert-endian = "scripts:gguf_convert_endian_entrypoint" gguf-dump = "scripts:gguf_dump_entrypoint" gguf-set-metadata = "scripts:gguf_set_metadata_entrypoint" +gguf-new-metadata = "scripts:gguf_new_metadata_entrypoint" diff --git a/gguf-py/scripts/__init__.py b/gguf-py/scripts/__init__.py index 77132db7a..1ad45639a 100644 --- a/gguf-py/scripts/__init__.py +++ b/gguf-py/scripts/__init__.py @@ -8,5 +8,6 @@ os.environ["NO_LOCAL_GGUF"] = "TRUE" gguf_convert_endian_entrypoint = import_module("scripts.gguf-convert-endian").main gguf_dump_entrypoint = import_module("scripts.gguf-dump").main gguf_set_metadata_entrypoint = import_module("scripts.gguf-set-metadata").main +gguf_new_metadata_entrypoint = import_module("scripts.gguf-new-metadata").main del import_module, os diff --git a/gguf-py/scripts/gguf-convert-endian.py b/gguf-py/scripts/gguf-convert-endian.py index 10a16ad06..b698af0fe 100755 --- a/gguf-py/scripts/gguf-convert-endian.py +++ b/gguf-py/scripts/gguf-convert-endian.py @@ -1,9 +1,11 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys +from tqdm import tqdm from pathlib import Path import numpy as np @@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / import gguf +logger = logging.getLogger("gguf-convert-endian") + def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): @@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None else: file_endian = host_endian order = host_endian if args.order == "native" else args.order - print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") + logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian") if file_endian == order: - print(f"* File is already {order.upper()} endian. Nothing to do.") + logger.info(f"* File is already {order.upper()} endian. Nothing to do.") sys.exit(0) - print("* Checking tensors for conversion compatibility") + logger.info("* Checking tensors for conversion compatibility") for tensor in reader.tensors: if tensor.tensor_type not in ( gguf.GGMLQuantizationType.F32, @@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None gguf.GGMLQuantizationType.Q8_0, ): raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}") - print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") + logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}") if args.dry_run: return - print("\n*** Warning *** Warning *** Warning **") - print("* This conversion process may damage the file. Ensure you have a backup.") + logger.warning("*** Warning *** Warning *** Warning **") + logger.warning("* This conversion process may damage the file. Ensure you have a backup.") if order != host_endian: - print("* Requested endian differs from host, you will not be able to load the model on this machine.") - print("* The file will be modified immediately, so if conversion fails or is interrupted") - print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") + logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.") + logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted") + logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:") response = input("YES, I am sure> ") if response != "YES": - print("You didn't enter YES. Okay then, see ya!") + logger.warning("You didn't enter YES. Okay then, see ya!") sys.exit(0) - print(f"\n* Converting fields ({len(reader.fields)})") + logger.info(f"* Converting fields ({len(reader.fields)})") for idx, field in enumerate(reader.fields.values()): - print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") + logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}") for part in field.parts: part.byteswap(inplace=True) - print(f"\n* Converting tensors ({len(reader.tensors)})") - for idx, tensor in enumerate(reader.tensors): - print( - f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, " - f"elements={tensor.n_elements}... ", - end="", + logger.info(f"* Converting tensors ({len(reader.tensors)})") + + for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")): + log_message = ( + f"Converting tensor {repr(tensor.name)}, " + f"type={tensor.tensor_type.name}, " + f"elements={tensor.n_elements} " ) - tensor_type = tensor.tensor_type + + # Byte-swap each part of the tensor's field for part in tensor.field.parts: part.byteswap(inplace=True) - if tensor_type != gguf.GGMLQuantizationType.Q8_0: + + # Byte-swap tensor data if necessary + if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0: + # Handle Q8_0 tensor blocks (block_q8_0) + # Specific handling of block_q8_0 is required. + # Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations. + + block_size = 34 # 34 bytes = + 32 * + + n_blocks = len(tensor.data) // block_size + for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)): + block_offs = block_num * block_size + + # Byte-Swap f16 sized delta field + delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) + delta.byteswap(inplace=True) + + # Byte-Swap Q8 weights + if block_num % 100000 == 0: + inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]") + + else: + # Handle other tensor types tensor.data.byteswap(inplace=True) - print() - continue - # A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes - block_size = 34 - n_blocks = len(tensor.data) // block_size - for block_num in range(n_blocks): - block_offs = block_num * block_size - # I know I said f16, but it doesn't matter here - any simple 16 bit type works. - delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16) - delta.byteswap(inplace=True) - if block_num % 100000 == 0: - print(f"[{(n_blocks - block_num) // 1000}K]", end="") - sys.stdout.flush() - print() - print("* Completion") + + pbar.set_description(log_message) + + logger.info("* Completion") def main() -> None: @@ -102,8 +119,13 @@ def main() -> None: "--dry-run", action="store_true", help="Don't actually change anything", ) + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+') convert_byteorder(reader, args) diff --git a/gguf-py/scripts/gguf-dump.py b/gguf-py/scripts/gguf-dump.py index dbf891508..1a37a7b91 100755 --- a/gguf-py/scripts/gguf-dump.py +++ b/gguf-py/scripts/gguf-dump.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from __future__ import annotations +import logging import argparse import os import sys @@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / from gguf import GGUFReader, GGUFValueType # noqa: E402 +logger = logging.getLogger("gguf-dump") + def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG' @@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]: # please see the comments in the modify_gguf.py example. def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: host_endian, file_endian = get_file_host_endian(reader) - print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') - print(f'\n* Dumping {len(reader.fields)} key/value pair(s)') + print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100 + print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100 for n, field in enumerate(reader.fields.values(), 1): if not field.types: pretty_type = 'N/A' @@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count else: pretty_type = str(field.types[-1].name) - print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '') + + log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}' if len(field.types) == 1: curr_type = field.types[0] if curr_type == GGUFValueType.STRING: - print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '') + log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60])) elif field.types[0] in reader.gguf_scalar_to_np: - print(' = {0}'.format(field.parts[-1][0]), end = '') - print() + log_message += ' = {0}'.format(field.parts[-1][0]) + print(log_message) # noqa: NP100 if args.no_tensors: return - print(f'\n* Dumping {len(reader.tensors)} tensor(s)') + print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100 for n, tensor in enumerate(reader.tensors, 1): prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape))) - print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') + print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100 def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None: @@ -103,10 +107,17 @@ def main() -> None: parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata") parser.add_argument("--json", action="store_true", help="Produce JSON output") parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + if not args.json: - print(f'* Loading: {args.model}') + logger.info(f'* Loading: {args.model}') + reader = GGUFReader(args.model, 'r') + if args.json: dump_metadata_json(reader, args) else: diff --git a/gguf-py/scripts/gguf-new-metadata.py b/gguf-py/scripts/gguf-new-metadata.py new file mode 100755 index 000000000..63d3c5d8f --- /dev/null +++ b/gguf-py/scripts/gguf-new-metadata.py @@ -0,0 +1,242 @@ +#!/usr/bin/env python3 +import logging +import argparse +import os +import sys +import json +from pathlib import Path + +import numpy as np +from tqdm import tqdm +from typing import Any, Sequence, NamedTuple + +# Necessary to load the local gguf package +if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): + sys.path.insert(0, str(Path(__file__).parent.parent)) + +import gguf + +logger = logging.getLogger("gguf-new-metadata") + + +class MetadataDetails(NamedTuple): + type: gguf.GGUFValueType + value: Any + description: str = '' + + +def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian: + if np.uint32(1) == np.uint32(1).newbyteorder("<"): + # Host is little endian + host_endian = gguf.GGUFEndian.LITTLE + swapped_endian = gguf.GGUFEndian.BIG + else: + # Sorry PDP or other weird systems that don't use BE or LE. + host_endian = gguf.GGUFEndian.BIG + swapped_endian = gguf.GGUFEndian.LITTLE + + if reader.byte_order == "S": + return swapped_endian + else: + return host_endian + + +def decode_field(field: gguf.ReaderField | None) -> Any: + if field and field.types: + main_type = field.types[0] + + if main_type == gguf.GGUFValueType.ARRAY: + sub_type = field.types[-1] + + if sub_type == gguf.GGUFValueType.STRING: + return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data] + else: + return [pv for idx in field.data for pv in field.parts[idx].tolist()] + if main_type == gguf.GGUFValueType.STRING: + return str(bytes(field.parts[-1]), encoding='utf-8') + else: + return field.parts[-1][0] + + return None + + +def get_field_data(reader: gguf.GGUFReader, key: str) -> Any: + field = reader.get_field(key) + + return decode_field(field) + + +def find_token(token_list: Sequence[int], token: str) -> Sequence[int]: + token_ids = [index for index, value in enumerate(token_list) if value == token] + + if len(token_ids) == 0: + raise LookupError(f'Unable to find "{token}" in token list!') + + return token_ids + + +def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None: + for field in reader.fields.values(): + # Suppress virtual fields and fields written by GGUFWriter + if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'): + logger.debug(f'Suppressing {field.name}') + continue + + # Skip old chat templates if we have new ones + if field.name.startswith(gguf.Keys.Tokenizer.CHAT_TEMPLATE) and gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: + logger.debug(f'Skipping {field.name}') + continue + + if field.name in remove_metadata: + logger.debug(f'Removing {field.name}') + continue + + old_val = MetadataDetails(field.types[0], decode_field(field)) + val = new_metadata.get(field.name, old_val) + + if field.name in new_metadata: + logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}') + del new_metadata[field.name] + elif val.value is not None: + logger.debug(f'Copying {field.name}') + + if val.value is not None: + writer.add_key(field.name) + writer.add_val(val.value, val.type) + + if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: + logger.debug('Adding chat template(s)') + writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value) + del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] + + for key, val in new_metadata.items(): + logger.debug(f'Adding {key}: "{val.value}" {val.description}') + writer.add_key(key) + writer.add_val(val.value, val.type) + + total_bytes = 0 + + for tensor in reader.tensors: + total_bytes += tensor.n_bytes + # Dimensions are written in reverse order, so flip them first + shape = np.flipud(tensor.shape).tolist() + writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type) + + bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True) + + writer.write_header_to_file() + writer.write_kv_data_to_file() + writer.write_ti_data_to_file() + + for tensor in reader.tensors: + writer.write_tensor_data(tensor.data) + bar.update(tensor.n_bytes) + + writer.close() + + +def main() -> None: + tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_')) + token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id')) + + parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata") + parser.add_argument("input", type=Path, help="GGUF format model input filename") + parser.add_argument("output", type=Path, help="GGUF format model output filename") + parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"') + parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."') + parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."') + parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json') + parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url') + parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '""')) + parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0')) + parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation") + parser.add_argument("--verbose", action="store_true", help="Increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"]) + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + new_metadata = {} + remove_metadata = args.remove_metadata or [] + + if args.general_name: + new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name) + + if args.general_description: + new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description) + + if args.chat_template: + new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template) + + if args.chat_template_config: + with open(args.chat_template_config, 'r') as fp: + config = json.load(fp) + template = config.get('chat_template') + if template: + new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template) + + if remove_metadata: + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Most metadata is required for a fully functional GGUF file,') + logger.warning('* removing crucial metadata may result in a corrupt output file!') + + if not args.force: + logger.warning('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': + logger.info("You didn't enter YES. Okay then, see ya!") + sys.exit(0) + + logger.info(f'* Loading: {args.input}') + reader = gguf.GGUFReader(args.input, 'r') + + arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE) + endianess = get_byteorder(reader) + + token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or [] + + for name, token in args.special_token or []: + if name not in token_names: + logger.warning(f'Unknown special token "{name}", ignoring...') + else: + ids = find_token(token_list, token) + new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}') + + if len(ids) > 1: + logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:') + logger.warning(', '.join(str(i) for i in ids)) + + for name, id_string in args.special_token_by_id or []: + if name not in token_names: + logger.warning(f'Unknown special token "{name}", ignoring...') + elif not id_string.isdecimal(): + raise LookupError(f'Token ID "{id_string}" is not a valid ID!') + else: + id_int = int(id_string) + + if id_int >= 0 and id_int < len(token_list): + new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}') + else: + raise LookupError(f'Token ID {id_int} is not within token list!') + + if os.path.isfile(args.output) and not args.force: + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') + response = input('YES, I am sure> ') + if response != 'YES': + logger.info("You didn't enter YES. Okay then, see ya!") + sys.exit(0) + + logger.info(f'* Writing: {args.output}') + writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess) + + alignment = get_field_data(reader, gguf.Keys.General.ALIGNMENT) + if alignment is not None: + logger.debug(f'Setting custom alignment: {alignment}') + writer.data_alignment = alignment + + copy_with_new_metadata(reader, writer, new_metadata, remove_metadata) + + +if __name__ == '__main__': + main() diff --git a/gguf-py/scripts/gguf-set-metadata.py b/gguf-py/scripts/gguf-set-metadata.py index 3ebdfa898..e35b651b8 100755 --- a/gguf-py/scripts/gguf-set-metadata.py +++ b/gguf-py/scripts/gguf-set-metadata.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import logging import argparse import os import sys @@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / from gguf import GGUFReader # noqa: E402 +logger = logging.getLogger("gguf-set-metadata") + def minimal_example(filename: str) -> None: reader = GGUFReader(filename, 'r+') @@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None: def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None: field = reader.get_field(args.key) if field is None: - print(f'! Field {repr(args.key)} not found', file = sys.stderr) + logger.error(f'! Field {repr(args.key)} not found') sys.exit(1) # Note that field.types is a list of types. This is because the GGUF # format supports arrays. For example, an array of UINT32 would # look like [GGUFValueType.ARRAY, GGUFValueType.UINT32] handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None if handler is None: - print( - f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}', - file = sys.stderr, - ) + logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}') sys.exit(1) current_value = field.parts[field.data[0]][0] new_value = handler(args.value) - print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') + logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}') if current_value == new_value: - print(f'- Key {repr(args.key)} already set to requested value {current_value}') + logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}') sys.exit(0) if args.dry_run: sys.exit(0) if not args.force: - print('*** Warning *** Warning *** Warning **') - print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') - print('* Enter exactly YES if you are positive you want to proceed:') + logger.warning('*** Warning *** Warning *** Warning **') + logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.') + logger.warning('* Enter exactly YES if you are positive you want to proceed:') response = input('YES, I am sure> ') if response != 'YES': - print("You didn't enter YES. Okay then, see ya!") + logger.info("You didn't enter YES. Okay then, see ya!") sys.exit(0) field.parts[field.data[0]][0] = new_value - print('* Field changed. Successful completion.') + logger.info('* Field changed. Successful completion.') def main() -> None: @@ -80,8 +80,13 @@ def main() -> None: parser.add_argument("value", type=str, help="Metadata value to set") parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything") parser.add_argument("--force", action="store_true", help="Change the field without confirmation") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"]) - print(f'* Loading: {args.model}') + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + logger.info(f'* Loading: {args.model}') reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+') set_metadata(reader, args) diff --git a/grammars/README.md b/grammars/README.md index e1383fa5c..2b8384d9d 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -51,7 +51,7 @@ single-line ::= [^\n]+ "\n"` ## Sequences and Alternatives -The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc. +The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc. Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`. @@ -89,3 +89,13 @@ This guide provides a brief overview. Check out the GBNF files in this directory ``` ./main -m --grammar-file grammars/some-grammar.gbnf -p 'Some prompt' ``` + +## Troubleshooting + +Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218). + +### Efficient optional repetitions + +A common pattern is to allow repetitions of a pattern `x` up to N times. + +While semantically correct, the syntax `x? x? x?.... x?` (with N repetitions) will result in extremely slow inference. Instead, you can write `(x (x (x ... (x)?...)?)?)?` (w/ N-deep nesting) diff --git a/llama.cpp b/llama.cpp index 30444e017..a60077759 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7,7 +7,7 @@ #include "ggml-alloc.h" #include "ggml-backend.h" -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA # include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) # include "ggml-opencl.h" @@ -52,12 +52,16 @@ #define NOMINMAX #endif #include + #ifndef PATH_MAX + #define PATH_MAX MAX_PATH + #endif #include #endif #include #include #include +#include #include #include #include @@ -68,10 +72,10 @@ #include #include #include -#include #include #include #include +#include #include #include #include @@ -102,8 +106,7 @@ #endif #define LLAMA_MAX_NODES 8192 -#define LLAMA_MAX_EXPERTS 8 - +#define LLAMA_MAX_EXPERTS 60 // // logging @@ -192,6 +195,7 @@ enum llm_arch { LLM_ARCH_LLAMA, LLM_ARCH_FALCON, LLM_ARCH_BAICHUAN, + LLM_ARCH_GROK, LLM_ARCH_GPT2, LLM_ARCH_GPTJ, LLM_ARCH_GPTNEOX, @@ -205,7 +209,9 @@ enum llm_arch { LLM_ARCH_STABLELM, LLM_ARCH_QWEN, LLM_ARCH_QWEN2, + LLM_ARCH_QWEN2MOE, LLM_ARCH_PHI2, + LLM_ARCH_PHI3, LLM_ARCH_PLAMO, LLM_ARCH_CODESHELL, LLM_ARCH_ORION, @@ -214,13 +220,17 @@ enum llm_arch { LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, LLM_ARCH_MAMBA, + LLM_ARCH_XVERSE, LLM_ARCH_COMMAND_R, + LLM_ARCH_DBRX, + LLM_ARCH_OLMO, LLM_ARCH_UNKNOWN, }; static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_FALCON, "falcon" }, + { LLM_ARCH_GROK, "grok" }, { LLM_ARCH_GPT2, "gpt2" }, { LLM_ARCH_GPTJ, "gptj" }, { LLM_ARCH_GPTNEOX, "gptneox" }, @@ -235,7 +245,9 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_STABLELM, "stablelm" }, { LLM_ARCH_QWEN, "qwen" }, { LLM_ARCH_QWEN2, "qwen2" }, + { LLM_ARCH_QWEN2MOE, "qwen2moe" }, { LLM_ARCH_PHI2, "phi2" }, + { LLM_ARCH_PHI3, "phi3" }, { LLM_ARCH_PLAMO, "plamo" }, { LLM_ARCH_CODESHELL, "codeshell" }, { LLM_ARCH_ORION, "orion" }, @@ -244,7 +256,10 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, { LLM_ARCH_MAMBA, "mamba" }, + { LLM_ARCH_XVERSE, "xverse" }, { LLM_ARCH_COMMAND_R, "command-r" }, + { LLM_ARCH_DBRX, "dbrx" }, + { LLM_ARCH_OLMO, "olmo" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -254,6 +269,7 @@ enum llm_kv { LLM_KV_GENERAL_ALIGNMENT, LLM_KV_GENERAL_NAME, LLM_KV_GENERAL_AUTHOR, + LLM_KV_GENERAL_VERSION, LLM_KV_GENERAL_URL, LLM_KV_GENERAL_DESCRIPTION, LLM_KV_GENERAL_LICENSE, @@ -290,12 +306,17 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SPLIT_NO, + LLM_KV_SPLIT_COUNT, + LLM_KV_SPLIT_TENSORS_COUNT, + LLM_KV_SSM_INNER_SIZE, LLM_KV_SSM_CONV_KERNEL, LLM_KV_SSM_STATE_SIZE, LLM_KV_SSM_TIME_STEP_RANK, LLM_KV_TOKENIZER_MODEL, + LLM_KV_TOKENIZER_PRE, LLM_KV_TOKENIZER_LIST, LLM_KV_TOKENIZER_TOKEN_TYPE, LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, @@ -306,11 +327,17 @@ enum llm_kv { LLM_KV_TOKENIZER_UNK_ID, LLM_KV_TOKENIZER_SEP_ID, LLM_KV_TOKENIZER_PAD_ID, + LLM_KV_TOKENIZER_CLS_ID, + LLM_KV_TOKENIZER_MASK_ID, LLM_KV_TOKENIZER_ADD_BOS, LLM_KV_TOKENIZER_ADD_EOS, LLM_KV_TOKENIZER_ADD_PREFIX, LLM_KV_TOKENIZER_HF_JSON, LLM_KV_TOKENIZER_RWKV, + LLM_KV_TOKENIZER_PREFIX_ID, + LLM_KV_TOKENIZER_SUFFIX_ID, + LLM_KV_TOKENIZER_MIDDLE_ID, + LLM_KV_TOKENIZER_EOT_ID, }; static const std::map LLM_KV_NAMES = { @@ -319,6 +346,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, { LLM_KV_GENERAL_NAME, "general.name" }, { LLM_KV_GENERAL_AUTHOR, "general.author" }, + { LLM_KV_GENERAL_VERSION, "general.version" }, { LLM_KV_GENERAL_URL, "general.url" }, { LLM_KV_GENERAL_DESCRIPTION, "general.description" }, { LLM_KV_GENERAL_LICENSE, "general.license" }, @@ -355,12 +383,17 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_SPLIT_NO, "split.no" }, + { LLM_KV_SPLIT_COUNT, "split.count" }, + { LLM_KV_SPLIT_TENSORS_COUNT, "split.tensors.count" }, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, + { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, { LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, "tokenizer.ggml.token_type_count" }, @@ -371,11 +404,17 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_TOKENIZER_UNK_ID, "tokenizer.ggml.unknown_token_id" }, { LLM_KV_TOKENIZER_SEP_ID, "tokenizer.ggml.seperator_token_id" }, { LLM_KV_TOKENIZER_PAD_ID, "tokenizer.ggml.padding_token_id" }, + { LLM_KV_TOKENIZER_CLS_ID, "tokenizer.ggml.cls_token_id" }, + { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" }, { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" }, { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" }, { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" }, { LLM_KV_TOKENIZER_HF_JSON, "tokenizer.huggingface.json" }, { LLM_KV_TOKENIZER_RWKV, "tokenizer.rwkv.world" }, + { LLM_KV_TOKENIZER_PREFIX_ID, "tokenizer.ggml.prefix_token_id" }, + { LLM_KV_TOKENIZER_SUFFIX_ID, "tokenizer.ggml.suffix_token_id" }, + { LLM_KV_TOKENIZER_MIDDLE_ID, "tokenizer.ggml.middle_token_id" }, + { LLM_KV_TOKENIZER_EOT_ID, "tokenizer.ggml.eot_token_id" }, }; struct LLM_KV { @@ -406,14 +445,21 @@ enum llm_tensor { LLM_TENSOR_ATTN_OUT_NORM, LLM_TENSOR_ATTN_ROT_EMBD, LLM_TENSOR_FFN_GATE_INP, + LLM_TENSOR_FFN_GATE_INP_SHEXP, LLM_TENSOR_FFN_NORM, LLM_TENSOR_FFN_GATE, LLM_TENSOR_FFN_DOWN, LLM_TENSOR_FFN_UP, LLM_TENSOR_FFN_ACT, - LLM_TENSOR_FFN_DOWN_EXP, + LLM_TENSOR_FFN_DOWN_EXP, // split experts for backward compatibility LLM_TENSOR_FFN_GATE_EXP, LLM_TENSOR_FFN_UP_EXP, + LLM_TENSOR_FFN_DOWN_EXPS, // merged experts + LLM_TENSOR_FFN_GATE_EXPS, + LLM_TENSOR_FFN_UP_EXPS, + LLM_TENSOR_FFN_DOWN_SHEXP, + LLM_TENSOR_FFN_GATE_SHEXP, + LLM_TENSOR_FFN_UP_SHEXP, LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, @@ -448,6 +494,9 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, }, }, { @@ -483,6 +532,31 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_GROK, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_EXP, "blk.%d.ffn_gate.%d" }, + { LLM_TENSOR_FFN_DOWN_EXP, "blk.%d.ffn_down.%d" }, + { LLM_TENSOR_FFN_UP_EXP, "blk.%d.ffn_up.%d" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + }, + }, { LLM_ARCH_GPT2, { @@ -548,6 +622,9 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, { LLM_TENSOR_FFN_ACT, "blk.%d.ffn.act" }, + { LLM_TENSOR_POS_EMBD, "position_embd" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm"}, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm"}, }, }, { @@ -645,6 +722,8 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, }, }, { @@ -680,6 +759,28 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_QWEN2MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + }, + }, { LLM_ARCH_PHI2, { @@ -696,6 +797,23 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_PHI3, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_PLAMO, { @@ -843,6 +961,25 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, + { + LLM_ARCH_XVERSE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_COMMAND_R, { @@ -856,6 +993,38 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + }, + }, + { + LLM_ARCH_DBRX, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, + { + LLM_ARCH_OLMO, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, { @@ -1030,7 +1199,7 @@ struct llama_file { size_t size; llama_file(const char * fname, const char * mode) { - fp = std::fopen(fname, mode); + fp = ggml_fopen(fname, mode); if (fp == NULL) { throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno))); } @@ -1099,6 +1268,7 @@ struct llama_file { } } }; +using llama_files = std::vector>; struct llama_mmap { void * addr; @@ -1299,6 +1469,7 @@ struct llama_mmap { } #endif }; +using llama_mmaps = std::vector>; // Represents some region of memory being locked using mlock or VirtualLock; // will automatically unlock on destruction. @@ -1448,13 +1619,14 @@ struct llama_mlock { static void raw_unlock(const void * addr, size_t len) {} #endif }; +using llama_mlocks = std::vector>; -static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { +static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) { std::vector result(8, 0); - const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); + const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); if (n_tokens < 0) { result.resize(-n_tokens); - int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); + int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special); GGML_ASSERT(check == -n_tokens); } else { @@ -1467,7 +1639,7 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) // host buffers should only be used when data is expected to be copied to/from the GPU if (host_buffer) { buft = ggml_backend_cuda_host_buffer_type(); @@ -1497,7 +1669,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { #ifdef GGML_USE_METAL buft = ggml_backend_metal_buffer_type(); -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) buft = ggml_backend_cuda_buffer_type(gpu); #elif defined(GGML_USE_VULKAN) buft = ggml_backend_vk_buffer_type(gpu); @@ -1523,7 +1695,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA if (ggml_backend_cuda_get_device_count() > 1) { buft = ggml_backend_cuda_split_buffer_type(tensor_split); } @@ -1544,7 +1716,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g } static size_t llama_get_device_count() { -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) return ggml_backend_cuda_get_device_count(); #elif defined(GGML_USE_SYCL) return ggml_backend_sycl_get_device_count(); @@ -1556,20 +1728,20 @@ static size_t llama_get_device_count() { } static size_t llama_get_device_memory(int device) { -#if defined(GGML_USE_CUBLAS) +#if defined(GGML_USE_CUDA) size_t total; size_t free; - ggml_backend_cuda_get_device_memory(device, &total, &free); + ggml_backend_cuda_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_SYCL) size_t total; size_t free; - ggml_backend_sycl_get_device_memory(device, &total, &free); + ggml_backend_sycl_get_device_memory(device, &free, &total); return free; #elif defined(GGML_USE_VULKAN) size_t total; size_t free; - ggml_backend_vk_get_device_memory(device, &total, &free); + ggml_backend_vk_get_device_memory(device, &free, &total); return free; #else return 1; @@ -1611,6 +1783,7 @@ enum e_model { MODEL_4B, MODEL_7B, MODEL_8B, + MODEL_12B, MODEL_13B, MODEL_14B, MODEL_15B, @@ -1621,10 +1794,15 @@ enum e_model { MODEL_40B, MODEL_65B, MODEL_70B, + MODEL_314B, MODEL_SMALL, MODEL_MEDIUM, MODEL_LARGE, MODEL_XL, + MODEL_A2_7B, + MODEL_8x7B, + MODEL_8x22B, + MODEL_16x12B, }; static const size_t kiB = 1024; @@ -1667,7 +1845,7 @@ struct llama_hparams { float f_logit_scale = 0.0f; bool causal_attn = true; - bool need_kq_pos = false; + bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; @@ -1738,6 +1916,7 @@ struct llama_cparams { uint32_t n_ctx; // context size used during inference uint32_t n_batch; uint32_t n_ubatch; + uint32_t n_seq_max; uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing @@ -1756,6 +1935,7 @@ struct llama_cparams { bool embeddings; bool causal_attn; bool offload_kqv; + bool flash_attn; enum llama_pooling_type pooling_type; @@ -1803,9 +1983,15 @@ struct llama_layer { // ff MoE struct ggml_tensor * ffn_gate_inp; - struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS]; - struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS]; - struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS]; + struct ggml_tensor * ffn_gate_exps; + struct ggml_tensor * ffn_down_exps; + struct ggml_tensor * ffn_up_exps ; + + // ff shared expert (shexp) + struct ggml_tensor * ffn_gate_inp_shexp; + struct ggml_tensor * ffn_gate_shexp; + struct ggml_tensor * ffn_down_shexp; + struct ggml_tensor * ffn_up_shexp; // ff bias struct ggml_tensor * ffn_down_b; // b2 @@ -1853,8 +2039,8 @@ struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; bool do_copy = false; - // with recurrent state models, a cell can hold the state for more than one past token - bool recurrent = false; + bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -1931,7 +2117,8 @@ struct llama_vocab { ttype type; }; - enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM; + enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; std::unordered_map token_to_id; std::vector id_to_token; @@ -1941,20 +2128,22 @@ struct llama_vocab { std::map, int> bpe_ranks; // default LLaMA special tokens - id special_bos_id = 1; - id special_eos_id = 2; - id special_unk_id = 0; - id special_sep_id = -1; - id special_pad_id = -1; + id special_bos_id = 1; + id special_eos_id = 2; + id special_unk_id = 0; + id special_sep_id = -1; + id special_pad_id = -1; + id special_cls_id = -1; + id special_mask_id = -1; int special_add_bos = -1; // -1 unknown, 1 add, 0 don't add. int special_add_eos = -1; // -1 unknown, 1 add, 0 don't add. id linefeed_id = 13; - id special_prefix_id = 32007; - id special_middle_id = 32009; - id special_suffix_id = 32008; - id special_eot_id = 32010; + id special_prefix_id = -1; + id special_suffix_id = -1; + id special_middle_id = -1; + id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token bool add_space_prefix = true; @@ -2023,12 +2212,12 @@ struct llama_model { // the model memory buffers for the tensor data std::vector bufs; - // model memory mapped file - std::unique_ptr mapping; + // model memory mapped files + llama_mmaps mappings; // objects representing data potentially being locked in memory - std::vector> mlock_bufs; - llama_mlock mlock_mmap; + llama_mlocks mlock_bufs; + llama_mlocks mlock_mmaps; // for quantize-stats only std::vector> tensors_by_name; @@ -2041,7 +2230,7 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { -#ifdef GGML_USE_CUBLAS +#ifdef GGML_USE_CUDA if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); } @@ -2060,10 +2249,6 @@ struct llama_context { ggml_backend_free(backend); } -#ifdef GGML_USE_VULKAN - ggml_vk_free_cpu_assist(); -#endif - ggml_backend_buffer_free(buf_output); } @@ -2100,20 +2285,20 @@ struct llama_context { // host buffer for the model output (logits and embeddings) ggml_backend_buffer_t buf_output = nullptr; - // decode output (2-dimensional array: [n_tokens][n_vocab]) - size_t logits_size = 0; - float * logits = nullptr; + // decode output (2-dimensional array: [n_outputs][n_vocab]) + size_t logits_size = 0; // capacity (of floats) for logits + float * logits = nullptr; + + std::vector output_ids; // map batch token positions to ids of the logits and embd buffers + size_t output_size = 0; // capacity (of tokens positions) for the output buffers + int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch -#ifndef NDEBUG - // guard against access to unset logits - std::vector logits_valid; -#endif bool logits_all = false; - // embeddings output (2-dimensional array: [n_tokens][n_embd]) + // embeddings output (2-dimensional array: [n_outputs][n_embd]) // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE - size_t embd_size = 0; - float * embd = nullptr; + size_t embd_size = 0; // capacity (of floats) for embeddings + float * embd = nullptr; // sequence embeddings output (map of [n_embd] vectors) // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE @@ -2130,14 +2315,15 @@ struct llama_context { struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] + struct ggml_tensor * inp_out_ids; // I32 [n_outputs] struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_pos; // F32 [kv_size] + struct ggml_tensor * inp_KQ_pos; // F32 [n_kv] struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] struct ggml_tensor * inp_s_copy; // I32 [kv_size] - struct ggml_tensor * inp_s_mask; // F32 [1, kv_size] - struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] + struct ggml_tensor * inp_s_mask; // F32 [1, n_kv] + struct ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch] // control vectors struct llama_control_vector cvec; @@ -2153,11 +2339,14 @@ struct llama_context { static bool llama_kv_cache_init( struct llama_kv_cache & cache, - const llama_model & model, + const llama_context * ctx, ggml_type type_k, ggml_type type_v, uint32_t kv_size, bool offload) { + const llama_model & model = ctx->model; + const llama_cparams & cparams = ctx->cparams; + const struct llama_hparams & hparams = model.hparams; const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); @@ -2168,8 +2357,9 @@ static bool llama_kv_cache_init( // TODO: find a nicer way to add other recurrent model architectures cache.recurrent = model.arch == LLM_ARCH_MAMBA; + cache.v_trans = !cparams.flash_attn; - // TODO: support mixed reccurent Transformer architectues + // TODO: support mixed recurrent Transformer architectures // NOTE: (!a || b) is a logical implication (a -> b) GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); @@ -2380,6 +2570,10 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { } cache.head = 0; cache.used = 0; + + for (auto & buf : cache.bufs) { + ggml_backend_buffer_clear(buf, 0); + } } static bool llama_kv_cache_seq_rm( @@ -2700,6 +2894,7 @@ namespace GGUFMeta { case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool"; case LLAMA_KV_OVERRIDE_TYPE_INT: return "int"; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float"; + case LLAMA_KV_OVERRIDE_TYPE_STR: return "str"; } return "unknown"; } @@ -2711,13 +2906,16 @@ namespace GGUFMeta { __func__, override_type_to_str(ovrd->tag), ovrd->key); switch (ovrd->tag) { case LLAMA_KV_OVERRIDE_TYPE_BOOL: { - LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false"); + LLAMA_LOG_INFO("%s\n", ovrd->val_bool ? "true" : "false"); } break; case LLAMA_KV_OVERRIDE_TYPE_INT: { - LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value); + LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->val_i64); } break; case LLAMA_KV_OVERRIDE_TYPE_FLOAT: { - LLAMA_LOG_INFO("%.6f\n", ovrd->float_value); + LLAMA_LOG_INFO("%.6f\n", ovrd->val_f64); + } break; + case LLAMA_KV_OVERRIDE_TYPE_STR: { + LLAMA_LOG_INFO("%s\n", ovrd->val_str); } break; default: // Shouldn't be possible to end up here, but just in case... @@ -2736,7 +2934,7 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) { - target = ovrd->bool_value; + target = ovrd->val_bool; return true; } return false; @@ -2746,7 +2944,7 @@ namespace GGUFMeta { static typename std::enable_if::value && std::is_integral::value, bool>::type try_override(OT & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) { - target = ovrd->int_value; + target = ovrd->val_i64; return true; } return false; @@ -2756,7 +2954,7 @@ namespace GGUFMeta { static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) { - target = ovrd->float_value; + target = ovrd->val_f64; return true; } return false; @@ -2765,12 +2963,11 @@ namespace GGUFMeta { template static typename std::enable_if::value, bool>::type try_override(T & target, const struct llama_model_kv_override * ovrd) { - (void)target; - (void)ovrd; - if (!ovrd) { return false; } - // Currently, we should never end up here so it would be a bug if we do. - throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n", - ovrd ? ovrd->key : "NULL")); + if (validate_override(LLAMA_KV_OVERRIDE_TYPE_STR, ovrd)) { + target = ovrd->val_str; + return true; + } + return false; } static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) { @@ -2792,6 +2989,8 @@ namespace GGUFMeta { }; } +using llama_buf_map = std::unordered_map; + struct llama_model_loader { int n_kv = 0; int n_tensors = 0; @@ -2801,55 +3000,148 @@ struct llama_model_loader { size_t n_bytes = 0; bool use_mmap = false; + bool check_tensors; - llama_file file; + llama_files files; llama_ftype ftype; llama_fver fver; - std::unique_ptr mapping; + llama_mmaps mappings; + + // Holds information on a model weight + struct llama_tensor_weight { + uint16_t idx; // source file index + size_t offs; // tensor data offset in the original file + + ggml_tensor * tensor; + + llama_tensor_weight(const llama_file * file, uint16_t idx, const char * name, const struct gguf_context * gguf_ctx, ggml_tensor * tensor) : idx(idx), tensor(tensor) { + const int tensor_idx = gguf_find_tensor(gguf_ctx, name); + offs = gguf_get_data_offset(gguf_ctx) + gguf_get_tensor_offset(gguf_ctx, tensor_idx); + + if (offs + ggml_nbytes(tensor) < offs || offs + ggml_nbytes(tensor) > file->size) { + throw std::runtime_error(format("tensor '%s' data is not within the file bounds, model is corrupted or incomplete", name)); + } + } + }; + std::vector weights; + std::unordered_map kv_overrides; - struct gguf_context * ctx_gguf = NULL; - struct ggml_context * ctx_meta = NULL; + struct gguf_context * meta = NULL; + std::vector contexts; std::string arch_name; LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN); - llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") { + llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); } - struct gguf_init_params params = { - /*.no_alloc = */ true, - /*.ctx = */ &ctx_meta, - }; - if (param_overrides_p != nullptr) { for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) { kv_overrides.insert({std::string(p->key), *p}); } } - ctx_gguf = gguf_init_from_file(fname.c_str(), params); - if (!ctx_gguf) { + struct ggml_context * ctx = NULL; + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + + meta = gguf_init_from_file(fname.c_str(), params); + if (!meta) { throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str())); } get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false); llm_kv = LLM_KV(llm_arch_from_string(arch_name)); - n_kv = gguf_get_n_kv(ctx_gguf); - n_tensors = gguf_get_n_tensors(ctx_gguf); + files.emplace_back(new llama_file(fname.c_str(), "rb")); + contexts.emplace_back(ctx); - fver = (enum llama_fver ) gguf_get_version(ctx_gguf); + // Save tensors data offset of the main file. + // For subsidiary files, `meta` tensor data offset must not be used, + // so we build a unified tensors index for weights. + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(files.back().get(), 0, cur->name, meta, cur); + } + uint16_t n_split = 0; + get_key(llm_kv(LLM_KV_SPLIT_COUNT), n_split, false); - for (int i = 0; i < n_tensors; i++) { - const char * name = gguf_get_tensor_name(ctx_gguf, i); - struct ggml_tensor * t = ggml_get_tensor(ctx_meta, name); - n_elements += ggml_nelements(t); - n_bytes += ggml_nbytes(t); + // Load additional GGML contexts + if (n_split > 1) { + uint16_t idx = 0; + get_key(llm_kv(LLM_KV_SPLIT_NO), idx); + if (idx != 0) { + throw std::runtime_error(format("illegal split file: %d, model must be loaded with the first split", idx)); + } + + char split_prefix[PATH_MAX] = {0}; + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), fname.c_str(), idx, n_split)) { + throw std::runtime_error(format("invalid split file: %s", fname.c_str())); + } + + if (trace > 0) { + LLAMA_LOG_INFO("%s: loading additional %d GGUFs\n", __func__, n_split); + } + + char split_path[PATH_MAX] = {0}; + for (idx = 1; idx < n_split; idx++) { + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + + struct gguf_init_params split_params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx, + }; + struct gguf_context * ctx_gguf = gguf_init_from_file(split_path, split_params); + if (!ctx_gguf) { + throw std::runtime_error(format("%s: failed to load GGUF split from %s\n", __func__, split_path)); + } + + files.emplace_back(new llama_file(split_path, "rb")); + contexts.emplace_back(ctx); + + // Save tensors data offset info of the shard. + for (ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { + weights.emplace_back(files.back().get(), idx, cur->name, ctx_gguf, cur); + } + + gguf_free(ctx_gguf); + } + + get_key(llm_kv(LLM_KV_SPLIT_TENSORS_COUNT), n_tensors); + + // sanity check + { + const int n_tensors_loaded = (int) weights.size(); + if (n_tensors != n_tensors_loaded) { + throw std::runtime_error(format("corrupted model: %d tensors expected but %d found", n_tensors, n_tensors_loaded)); + } + } + + LLAMA_LOG_INFO("%s: additional %d GGUFs metadata loaded.\n", __func__, n_split - 1); + } + + n_kv = gguf_get_n_kv(meta); + n_tensors = weights.size(); + + fver = (enum llama_fver) gguf_get_version(meta); + + std::set tensor_names; + for (auto & w : weights) { + n_elements += ggml_nelements(w.tensor); + n_bytes += ggml_nbytes(w.tensor); + // make sure there is no duplicated tensor names + const std::string name(w.tensor->name); + auto found = tensor_names.find(name); + if (found != tensor_names.end()) { + throw std::runtime_error(format("invalid model: tensor '%s' is duplicated", w.tensor->name)); + } + tensor_names.insert(name); } LLAMA_LOG_INFO("%s: loaded meta data with %d key-value pairs and %d tensors from %s (version %s)\n", @@ -2864,7 +3156,8 @@ struct llama_model_loader { enum ggml_type type_max = GGML_TYPE_F32; for (int i = 0; i < n_tensors; i++) { - enum ggml_type type = gguf_get_tensor_type(ctx_gguf, i); + const ggml_tensor * tensor = weights.at(i).tensor; + enum ggml_type type = tensor->type; n_type[type]++; @@ -2874,14 +3167,15 @@ struct llama_model_loader { } if (trace > 0) { - struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str()); + const uint16_t sid = weights.at(i).idx; + LLAMA_LOG_INFO("%s: - tensor %4d, split %2d: %32s %-8s [ %s ]\n", __func__, i, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); } } switch (type_max) { case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break; case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break; + case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break; case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break; case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break; case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break; @@ -2897,6 +3191,7 @@ struct llama_model_loader { case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break; case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break; case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break; + case GGML_TYPE_IQ1_M: ftype = LLAMA_FTYPE_MOSTLY_IQ1_M; break; case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break; case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break; case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break; @@ -2911,22 +3206,23 @@ struct llama_model_loader { ftype = (llama_ftype) (ftype | LLAMA_FTYPE_GUESSED); { - const int kid = gguf_find_key(ctx_gguf, "general.file_type"); + const int kid = gguf_find_key(meta, "general.file_type"); if (kid >= 0) { - ftype = (llama_ftype) gguf_get_val_u32(ctx_gguf, kid); + ftype = (llama_ftype) gguf_get_val_u32(meta, kid); } } LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__); + for (int i = 0; i < n_kv; i++) { - const char * name = gguf_get_key(ctx_gguf, i); - const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i); + const char * name = gguf_get_key(meta, i); + const enum gguf_type type = gguf_get_kv_type(meta, i); const std::string type_name = type == GGUF_TYPE_ARRAY - ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(ctx_gguf, i)), gguf_get_arr_n(ctx_gguf, i)) + ? format("%s[%s,%d]", gguf_type_name(type), gguf_type_name(gguf_get_arr_type(meta, i)), gguf_get_arr_n(meta, i)) : gguf_type_name(type); - std::string value = gguf_kv_to_str(ctx_gguf, i); + std::string value = gguf_kv_to_str(meta, i); const size_t MAX_VALUE_LEN = 40; if (value.size() > MAX_VALUE_LEN) { value = format("%s...", value.substr(0, MAX_VALUE_LEN - 3).c_str()); @@ -2952,21 +3248,22 @@ struct llama_model_loader { } this->use_mmap = use_mmap; + this->check_tensors = check_tensors; } ~llama_model_loader() { - if (ctx_gguf) { - gguf_free(ctx_gguf); + if (meta) { + gguf_free(meta); } - if (ctx_meta) { - ggml_free(ctx_meta); + for (auto * ctx : contexts) { + ggml_free(ctx); } } template typename std::enable_if::value, bool>::type get_arr_n(const std::string & key, T & result, const bool required = true) { - const int kid = gguf_find_key(ctx_gguf, key.c_str()); + const int kid = gguf_find_key(meta, key.c_str()); if (kid < 0) { if (required) { @@ -2976,7 +3273,7 @@ struct llama_model_loader { } struct GGUFMeta::ArrayInfo arr_info = - GGUFMeta::GKV::get_kv(ctx_gguf, kid); + GGUFMeta::GKV::get_kv(meta, kid); result = arr_info.length; @@ -2996,7 +3293,7 @@ struct llama_model_loader { const struct llama_model_kv_override * override = it != kv_overrides.end() ? &it->second : nullptr; - const bool found = GGUFMeta::GKV::set(ctx_gguf, key, result, override); + const bool found = GGUFMeta::GKV::set(meta, key, result, override); if (required && !found) { throw std::runtime_error(format("key not found in model: %s", key.c_str())); @@ -3019,28 +3316,61 @@ struct llama_model_loader { } const char * get_tensor_name(int i) const { - return gguf_get_tensor_name(ctx_gguf, i); + return weights.at(i).tensor->name; + } + + const llama_tensor_weight * get_weight(const char * name) const { + for (const auto & weight : weights) { + if (strcmp(name, weight.tensor->name) == 0) { + return &weight; + } + } + return nullptr; + } + + const llama_tensor_weight * get_weight(int i) const { + return get_weight(get_tensor_name(i)); + } + + const llama_tensor_weight & require_weight(const char * name) const { + const llama_tensor_weight * weight = get_weight(name); + if (!weight) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); + } + return *weight; } struct ggml_tensor * get_tensor_meta(const char * name) const { - return ggml_get_tensor(ctx_meta, name); + const auto * weight = get_weight(name); + if (!weight) { + return nullptr; + } + return weight->tensor; + } + + struct ggml_tensor * require_tensor_meta(const char * name) const { + struct ggml_tensor * tensor = get_tensor_meta(name); + if (!tensor) { + throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name)); + } + return tensor; } struct ggml_tensor * get_tensor_meta(int i) const { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { - struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - ggml_set_name(tensor, ggml_get_name(meta)); + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, const struct ggml_tensor * cur) { + struct ggml_tensor * tensor = ggml_dup_tensor(ctx, cur); + ggml_set_name(tensor, ggml_get_name(cur)); n_created++; return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); + const struct ggml_tensor * check_tensor_dims(const std::string & name, const std::vector & ne, bool required) const { + const struct ggml_tensor * cur = get_tensor_meta(name.c_str()); if (cur == NULL) { if (!required) { @@ -3051,8 +3381,8 @@ struct llama_model_loader { { bool is_ok = true; - for (size_t i = 0; i < ne.size(); ++i) { - if (ne[i] != cur->ne[i]) { + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + if ((i < ne.size() && ne[i] != cur->ne[i]) || (i >= ne.size() && cur->ne[i] != 1)) { is_ok = false; break; } @@ -3066,127 +3396,231 @@ struct llama_model_loader { } } + return cur; + } + + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { + const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); + + if (cur == NULL) { + return NULL; + } + return create_tensor_for(ctx, cur); } + struct ggml_tensor * create_tensor_as_view(struct ggml_context * ctx, struct ggml_tensor * base, const std::string & name, const std::vector & ne, size_t offset, bool required = true) { + const struct ggml_tensor * cur = check_tensor_dims(name, ne, required); + + if (cur == NULL) { + return NULL; + } + + if (cur->type != base->type) { + throw std::runtime_error(format("%s: tensor '%s' has wrong type; expected %s, got %s", __func__, name.c_str(), ggml_type_name(base->type), ggml_type_name(cur->type))); + } + + std::array dims; + for (size_t i = 0; i < GGML_MAX_DIMS; ++i) { + dims[i] = i < ne.size() ? ne[i] : 1; + } + + struct ggml_tensor * tensor = ggml_view_4d(ctx, base, + dims[0], dims[1], dims[2], dims[3], + cur->nb[1], cur->nb[2], cur->nb[3], + offset); + + ggml_set_name(tensor, name.c_str()); + + n_created++; + + return tensor; + } + void done_getting_tensors() const { if (n_created != n_tensors) { throw std::runtime_error(format("%s: wrong number of tensors; expected %d, got %d", __func__, n_tensors, n_created)); } } - size_t file_offset(const char * name) const { - const int idx = gguf_find_tensor(ctx_gguf, name); - - if (idx < 0) { - throw std::runtime_error(format("%s: tensor '%s' not found in the file", __func__, name)); - } - - return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); - } - - void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { - // prefetch the whole file - all the data is needed anyway + void init_mappings(bool prefetch = true, llama_mlocks * mlock_mmaps = nullptr) { if (use_mmap) { - mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); + mappings.reserve(files.size()); + mmaps_used.reserve(files.size()); + for (const auto & file : files) { + std::unique_ptr mapping(new llama_mmap(file.get(), prefetch ? -1 : 0, ggml_is_numa())); + mmaps_used.emplace_back(mapping->size, 0); + if (mlock_mmaps) { + std::unique_ptr mlock_mmap(new llama_mlock()); + mlock_mmap->init(mapping->addr); + mlock_mmaps->emplace_back(std::move(mlock_mmap)); + } + mappings.emplace_back(std::move(mapping)); + } } // compute the total size of all tensors for progress reporting - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && mapping) { - if (lmlock) { - lmlock->init(mapping->addr); - } - mmap_used_first = mapping->size; + for (auto & w : weights) { + size_data += ggml_nbytes(w.tensor); } } - void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { - GGML_ASSERT(mapping); + void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const { + GGML_ASSERT(!mappings.empty()); + const auto & mapping = mappings.at(idx); *first = mapping->size; *last = 0; + *addr = mapping->addr; for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { - const size_t offs = file_offset(ggml_get_name(tensor)); - *first = std::min(*first, offs); - *last = std::max(*last, offs + ggml_nbytes(tensor)); + try { + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight) { + continue; + } + if (weight->idx != idx) { + continue; + } + *first = std::min(*first, weight->offs); + *last = std::max(*last, weight->offs + ggml_nbytes(tensor)); + } catch(...) { + // the tensor is not in the model + } } } // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const { - const size_t offs = file_offset(ggml_get_name(cur)); + const auto & w = require_weight(ggml_get_name(cur)); - if (use_mmap && mapping) { + if (use_mmap) { + const auto & mapping = mappings.at(w.idx); if (cur->data == nullptr) { - cur->data = (uint8_t *)mapping->addr + offs; + cur->data = (uint8_t *)mapping->addr + w.offs; } else { - memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + memcpy(cur->data, (uint8_t *)mapping->addr + w.offs, ggml_nbytes(cur)); } } else { GGML_ASSERT(cur->data != nullptr); - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + GGML_ASSERT(w.idx < files.size()); + const auto & file = files.at(w.idx); + file->seek(w.offs, SEEK_SET); + file->read_raw(cur->data, ggml_nbytes(cur)); + } + + if (check_tensors && !ggml_validate_row_data(cur->type, cur->data, ggml_nbytes(cur))) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); } } size_t size_done = 0; size_t size_data = 0; - size_t mmap_used_first = -1; - size_t mmap_used_last = 0; + std::vector> mmaps_used; // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { - GGML_ASSERT(size_data != 0 && "call init_mapping() first"); + bool load_all_data( + struct ggml_context * ctx, + llama_buf_map & bufs_mmap, + llama_mlocks * lmlocks, + llama_progress_callback progress_callback, + void * progress_callback_user_data) { + GGML_ASSERT(size_data != 0 && "call init_mappings() first"); std::vector> read_buf; + std::vector>> validation_result; for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + const auto * weight = get_weight(ggml_get_name(cur)); + if (weight == nullptr) { + // this can happen with split experts models + continue; + } + if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { return false; } } - const size_t offs = file_offset(ggml_get_name(cur)); + size_t n_size = ggml_nbytes(cur); - if (use_mmap && mapping) { + if (use_mmap) { + const auto & mapping = mappings.at(weight->idx); + ggml_backend_buffer_t buf_mmap = nullptr; + if (bufs_mmap.count(weight->idx)) { + buf_mmap = bufs_mmap.at(weight->idx); + } + uint8_t * data = (uint8_t *) mapping->addr + weight->offs; + + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, data, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, data, n_size)); + })); + } + + GGML_ASSERT(buf_mmap || cur->data); // either we have a buffer to allocate the tensor in, or it is already allocated if (buf_mmap && cur->data == nullptr) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); + ggml_backend_tensor_alloc(buf_mmap, cur, data); + if (lmlocks) { + const auto & lmlock = lmlocks->at(weight->idx); + lmlock->grow_to(weight->offs + n_size); } - mmap_used_first = std::min(mmap_used_first, offs); - mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); + + auto & mmap_used = mmaps_used[weight->idx]; + mmap_used.first = std::min(mmap_used.first, weight->offs); + mmap_used.second = std::max(mmap_used.second, weight->offs + n_size); } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + ggml_backend_tensor_set(cur, data, 0, n_size); } } else { + GGML_ASSERT(weight->idx < files.size()); + const auto & file = files.at(weight->idx); if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); + file->seek(weight->offs, SEEK_SET); + file->read_raw(cur->data, n_size); + if (check_tensors) { + validation_result.emplace_back(std::async(std::launch::async, [cur, n_size] { + return std::make_pair(cur, ggml_validate_row_data(cur->type, cur->data, n_size)); + })); + } } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); + read_buf.resize(n_size); + file->seek(weight->offs, SEEK_SET); + file->read_raw(read_buf.data(), n_size); + ggml_backend_tensor_set(cur, read_buf.data(), 0, n_size); + if (check_tensors && !ggml_validate_row_data(cur->type, read_buf.data(), n_size)) { + throw std::runtime_error(format("tensor '%s' has invalid data", ggml_get_name(cur))); + } } } - size_done += ggml_nbytes(cur); + size_done += n_size; + } + + // check validation results + bool validation_failed = false; + for (auto & future : validation_result) { + auto result = future.get(); + if (!result.second) { + LLAMA_LOG_ERROR("%s: tensor '%s' has invalid data\n", __func__, ggml_get_name(result.first)); + validation_failed = true; + } + } + if (validation_failed) { + throw std::runtime_error("found tensors with invalid data"); } // check if this is the last call and do final cleanup if (size_done >= size_data) { // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_used_first); - if (mmap_used_last != 0) { - mapping->unmap_fragment(mmap_used_last, mapping->size); + if (use_mmap) { + for (uint32_t idx = 0; idx < mappings.size(); idx++) { + const auto & mmap_used = mmaps_used.at(idx); + auto & mapping = mappings.at(idx); + mapping->unmap_fragment(0, mmap_used.first); + if (mmap_used.second != 0) { + mapping->unmap_fragment(mmap_used.second, mapping->size); + } } } if (progress_callback) { @@ -3233,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { switch (ftype) { case LLAMA_FTYPE_ALL_F32: return "all F32"; case LLAMA_FTYPE_MOSTLY_F16: return "F16"; + case LLAMA_FTYPE_MOSTLY_BF16: return "BF16"; case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0"; case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1"; case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16: @@ -3259,6 +3694,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw"; case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ1_M :return "IQ1_M - 1.75 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw"; case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw"; case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw"; @@ -3280,6 +3716,7 @@ static const char * llama_model_type_name(e_model type) { case MODEL_3B: return "3B"; case MODEL_7B: return "7B"; case MODEL_8B: return "8B"; + case MODEL_12B: return "12B"; case MODEL_13B: return "13B"; case MODEL_14B: return "14B"; case MODEL_15B: return "15B"; @@ -3290,10 +3727,15 @@ static const char * llama_model_type_name(e_model type) { case MODEL_40B: return "40B"; case MODEL_65B: return "65B"; case MODEL_70B: return "70B"; + case MODEL_314B: return "314B"; case MODEL_SMALL: return "0.1B"; case MODEL_MEDIUM: return "0.4B"; case MODEL_LARGE: return "0.8B"; case MODEL_XL: return "1.5B"; + case MODEL_A2_7B: return "A2.7B"; + case MODEL_8x7B: return "8x7B"; + case MODEL_8x22B: return "8x22B"; + case MODEL_16x12B: return "16x12B"; default: return "?B"; } } @@ -3319,7 +3761,7 @@ static void llm_load_hparams( llama_model_loader & ml, llama_model & model) { auto & hparams = model.hparams; - const gguf_context * ctx = ml.ctx_gguf; + const gguf_context * ctx = ml.meta; // get metadata as string for (int i = 0; i < gguf_get_n_kv(ctx); i++) { @@ -3408,15 +3850,23 @@ static void llm_load_hparams( { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - switch (hparams.n_layer) { - case 22: model.type = e_model::MODEL_1B; break; - case 26: model.type = e_model::MODEL_3B; break; - case 32: model.type = e_model::MODEL_7B; break; - case 40: model.type = e_model::MODEL_13B; break; - case 48: model.type = e_model::MODEL_34B; break; - case 60: model.type = e_model::MODEL_30B; break; - case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; - default: model.type = e_model::MODEL_UNKNOWN; + if (hparams.n_expert == 8) { + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_8x7B; break; + case 56: model.type = e_model::MODEL_8x22B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } else { + switch (hparams.n_layer) { + case 22: model.type = e_model::MODEL_1B; break; + case 26: model.type = e_model::MODEL_3B; break; + case 32: model.type = hparams.n_vocab < 40000 ? e_model::MODEL_7B : e_model::MODEL_8B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 48: model.type = e_model::MODEL_34B; break; + case 60: model.type = e_model::MODEL_30B; break; + case 80: model.type = hparams.n_head == hparams.n_head_kv ? e_model::MODEL_65B : e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } } } break; case LLM_ARCH_MINICPM: @@ -3428,6 +3878,15 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_GROK: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + + switch (hparams.n_layer) { + case 64: model.type = e_model::MODEL_314B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_FALCON: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); @@ -3549,6 +4008,7 @@ static void llm_load_hparams( switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; + case 40: model.type = e_model::MODEL_12B; break; default: model.type = e_model::MODEL_UNKNOWN; } } break; @@ -3573,10 +4033,28 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_QWEN2MOE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_A2_7B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_PHI2: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 24: model.type = e_model::MODEL_1B; break; + case 32: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_PHI3: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { case 24: model.type = e_model::MODEL_1B; break; case 32: model.type = e_model::MODEL_3B; break; @@ -3679,6 +4157,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_XVERSE: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_13B; break; + case 80: model.type = e_model::MODEL_65B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; case LLM_ARCH_COMMAND_R: { ml.get_key(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale); @@ -3688,20 +4176,44 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_DBRX: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv); + + switch (hparams.n_layer) { + case 40: model.type = e_model::MODEL_16x12B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; + case LLM_ARCH_OLMO: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv, false); + + switch (hparams.n_layer) { + case 22: model.type = e_model::MODEL_1B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 80: model.type = e_model::MODEL_70B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } model.ftype = ml.ftype; if (hparams.f_max_alibi_bias > 0.0f) { - hparams.need_kq_pos = true; + hparams.use_alibi = true; } hparams.rope_type = llama_rope_type(&model); } // TODO: This should probably be in llama.h -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special = false); +static std::vector llama_tokenize_internal( + const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special = false +); static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch); static void llm_load_vocab( @@ -3709,45 +4221,98 @@ static void llm_load_vocab( llama_model & model) { auto & vocab = model.vocab; - struct gguf_context * ctx = ml.ctx_gguf; + struct gguf_context * ctx = ml.meta; const auto kv = LLM_KV(model.arch); // determine vocab type { - std::string tokenizer_name; + std::string tokenizer_model; + std::string tokenizer_pre; - ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name); + ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model); + ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false); - if (tokenizer_name == "no_vocab") { + if (tokenizer_model == "no_vocab") { vocab.type = LLAMA_VOCAB_TYPE_NONE; // default special tokens - vocab.special_bos_id = -1; - vocab.special_eos_id = -1; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.linefeed_id = -1; + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.special_cls_id = -1; + vocab.special_mask_id = -1; + vocab.linefeed_id = -1; return; - } else if (tokenizer_name == "llama") { + } else if (tokenizer_model == "llama") { vocab.type = LLAMA_VOCAB_TYPE_SPM; // default special tokens - vocab.special_bos_id = 1; - vocab.special_eos_id = 2; - vocab.special_unk_id = 0; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; + vocab.special_bos_id = 1; + vocab.special_eos_id = 2; + vocab.special_unk_id = 0; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.special_cls_id = -1; + vocab.special_mask_id = -1; + + // For Fill-In-the-Middle (FIM)/infill models which where converted + // prior to support of FIM special tokens in GGUF, the following + // will allow those models to continue to work. The general names + // of the known models are currently CodeLlama (LLM_ARCH_LLAMA) and + // CodeGemma (LLM_ARCH_GEMMA). This can potentially be removed once + // new versions of these models have been published. + std::string gen_name; + ml.get_key(LLM_KV_GENERAL_NAME, gen_name, false); + + std::transform(gen_name.begin(), gen_name.end(), gen_name.begin(), + [](unsigned char c){ return std::tolower(c); }); + + if (gen_name.find("code") != std::string::npos) { + if (model.arch == LLM_ARCH_LLAMA) { + vocab.special_prefix_id = 32007; + vocab.special_suffix_id = 32008; + vocab.special_middle_id = 32009; + vocab.special_eot_id = 32010; + } else if (model.arch == LLM_ARCH_GEMMA) { + vocab.special_prefix_id = 67; + vocab.special_suffix_id = 69; + vocab.special_middle_id = 68; + // TODO: this is not EOT, it is "file separator" token, needs fix + // https://huggingface.co/google/codegemma-7b-it/blob/9b1d9231388358c04d90bd003458f5070d97db44/tokenizer_config.json#L565-L572 + //vocab.special_eot_id = 70; + vocab.special_eot_id = 107; + } + } const int add_space_prefix_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_ADD_PREFIX).c_str()); if (add_space_prefix_keyidx != -1) { vocab.add_space_prefix = gguf_get_val_bool(ctx, add_space_prefix_keyidx); } // The default value of add_space_prefix is true. - } else if (tokenizer_name == "gpt2") { - vocab.type = LLAMA_VOCAB_TYPE_BPE; + } else if (tokenizer_model == "bert") { + vocab.type = LLAMA_VOCAB_TYPE_WPM; + // default special tokens + vocab.special_bos_id = -1; + vocab.special_eos_id = -1; + vocab.special_unk_id = 100; + vocab.special_sep_id = 102; + vocab.special_pad_id = 0; + vocab.special_cls_id = 101; + vocab.special_mask_id = 103; + vocab.add_space_prefix = false; + } else { + if (tokenizer_model == "gpt2") { + vocab.type = LLAMA_VOCAB_TYPE_BPE; + } else { + LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_model.c_str()); + LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); + vocab.type = LLAMA_VOCAB_TYPE_SPM; + return; + } // read bpe merges and populate bpe ranks const int merges_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_MERGES).c_str()); if (merges_keyidx == -1) { @@ -3774,26 +4339,72 @@ static void llm_load_vocab( } // default special tokens - vocab.special_bos_id = 11; - vocab.special_eos_id = 11; - vocab.special_unk_id = -1; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - } else if (tokenizer_name == "bert") { - vocab.type = LLAMA_VOCAB_TYPE_WPM; + vocab.special_bos_id = 11; + vocab.special_eos_id = 11; + vocab.special_unk_id = -1; + vocab.special_sep_id = -1; + vocab.special_pad_id = -1; + vocab.special_cls_id = -1; + vocab.special_mask_id = -1; + } - // default special tokens - vocab.special_bos_id = 101; - vocab.special_eos_id = 102; - vocab.special_unk_id = 100; - vocab.special_sep_id = -1; - vocab.special_pad_id = -1; - vocab.add_space_prefix = false; + // for now, only BPE models have pre-tokenizers + if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { + if (tokenizer_pre.empty()) { + LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); + LLAMA_LOG_WARN("%s: \n", __func__); + LLAMA_LOG_WARN("%s: ************************************ \n", __func__); + LLAMA_LOG_WARN("%s: GENERATION QUALITY WILL BE DEGRADED! \n", __func__); + LLAMA_LOG_WARN("%s: CONSIDER REGENERATING THE MODEL \n", __func__); + LLAMA_LOG_WARN("%s: ************************************ \n", __func__); + LLAMA_LOG_WARN("%s: \n", __func__); + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + } else if ( + tokenizer_pre == "default") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; + } else if ( + tokenizer_pre == "llama3" || + tokenizer_pre == "llama-v3" || + tokenizer_pre == "llama-bpe") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_LLAMA3; + } else if ( + tokenizer_pre == "deepseek-llm") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM; + } else if ( + tokenizer_pre == "deepseek-coder") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER; + } else if ( + tokenizer_pre == "falcon") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON; + } else if ( + tokenizer_pre == "mpt") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_MPT; + } else if ( + tokenizer_pre == "starcoder") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_STARCODER; + } else if ( + tokenizer_pre == "gpt-2") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2; + } else if ( + tokenizer_pre == "refact") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT; + } else if ( + tokenizer_pre == "command-r") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R; + } else if ( + tokenizer_pre == "qwen2") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2; + } else if ( + tokenizer_pre == "olmo") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO; + } else if ( + tokenizer_pre == "dbrx") { + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX; + } else { + throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str())); + } } else { - LLAMA_LOG_WARN("%s: unknown tokenizer: '%s'", __func__, tokenizer_name.c_str()); - LLAMA_LOG_WARN("%s: using default tokenizer: 'llama'", __func__); - - vocab.type = LLAMA_VOCAB_TYPE_SPM; + vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } } @@ -3842,7 +4453,7 @@ static void llm_load_vocab( } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) { vocab.linefeed_id = vocab.special_pad_id; } else { - const std::vector ids = llama_tokenize_internal(vocab, "\u010A", false); + const std::vector ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A GGML_ASSERT(!ids.empty() && "model vocab missing newline token"); vocab.linefeed_id = ids[0]; } @@ -3850,12 +4461,19 @@ static void llm_load_vocab( // special tokens { const std::vector> special_token_types = { - { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, - { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, - { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, - { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, - { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + { LLM_KV_TOKENIZER_BOS_ID, vocab.special_bos_id }, + { LLM_KV_TOKENIZER_EOS_ID, vocab.special_eos_id }, + { LLM_KV_TOKENIZER_UNK_ID, vocab.special_unk_id }, + { LLM_KV_TOKENIZER_SEP_ID, vocab.special_sep_id }, + { LLM_KV_TOKENIZER_PAD_ID, vocab.special_pad_id }, + { LLM_KV_TOKENIZER_CLS_ID, vocab.special_cls_id }, + { LLM_KV_TOKENIZER_MASK_ID, vocab.special_mask_id }, + { LLM_KV_TOKENIZER_PREFIX_ID, vocab.special_prefix_id }, + { LLM_KV_TOKENIZER_SUFFIX_ID, vocab.special_suffix_id }, + { LLM_KV_TOKENIZER_MIDDLE_ID, vocab.special_middle_id }, + { LLM_KV_TOKENIZER_EOT_ID, vocab.special_eot_id }, }; + for (const auto & it : special_token_types) { const std::string & key = kv(std::get<0>(it)); int32_t & id = std::get<1>(it); @@ -3870,7 +4488,6 @@ static void llm_load_vocab( } else { id = new_id; } - } // Handle add_bos_token and add_eos_token @@ -3884,6 +4501,28 @@ static void llm_load_vocab( vocab.special_add_eos = int(temp); } } + + // find EOT token: "<|eot_id|>", "<|im_end|>", "", etc. + // + // TODO: convert scripts should provide this token through the KV metadata LLAMA_KV_TOKENIZER_EOT_ID + // for now, we apply this workaround to find the EOT token based on its text + if (vocab.special_eot_id == -1) { + for (const auto & t : vocab.token_to_id) { + if ( + // TODO: gemma "" is exported as a normal token, so the following check does not work + // need to fix convert script + //vocab.id_to_token[t.second].type == LLAMA_TOKEN_TYPE_CONTROL && + (t.first == "<|eot_id|>" || + t.first == "<|im_end|>" || + t.first == "<|end|>" || + t.first == "" + ) + ) { + vocab.special_eot_id = t.second; + break; + } + } + } } // build special tokens cache @@ -4046,12 +4685,19 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: general.name = %s\n", __func__, model.name.c_str()); // special tokens - if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } - if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } - if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } - if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } - if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } - if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); } + if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); } + if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); } + if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); } + if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); } + if (vocab.special_cls_id != -1) { LLAMA_LOG_INFO( "%s: CLS token = %d '%s'\n", __func__, vocab.special_cls_id, vocab.id_to_token[vocab.special_cls_id].text.c_str() ); } + if (vocab.special_mask_id != -1) { LLAMA_LOG_INFO( "%s: MASK token = %d '%s'\n", __func__, vocab.special_mask_id, vocab.id_to_token[vocab.special_mask_id].text.c_str() ); } + + if (vocab.linefeed_id != -1) { LLAMA_LOG_INFO( "%s: LF token = %d '%s'\n", __func__, vocab.linefeed_id, vocab.id_to_token[vocab.linefeed_id].text.c_str() ); } + if (vocab.special_prefix_id != -1) { LLAMA_LOG_INFO( "%s: PRE token = %d '%s'\n", __func__, vocab.special_prefix_id, vocab.id_to_token[vocab.special_prefix_id].text.c_str() ); } + if (vocab.special_suffix_id != -1) { LLAMA_LOG_INFO( "%s: SUF token = %d '%s'\n", __func__, vocab.special_suffix_id, vocab.id_to_token[vocab.special_suffix_id].text.c_str() ); } + if (vocab.special_middle_id != -1) { LLAMA_LOG_INFO( "%s: MID token = %d '%s'\n", __func__, vocab.special_middle_id, vocab.id_to_token[vocab.special_middle_id].text.c_str() ); } + if (vocab.special_eot_id != -1) { LLAMA_LOG_INFO( "%s: EOT token = %d '%s'\n", __func__, vocab.special_eot_id, vocab.id_to_token[vocab.special_eot_id].text.c_str() ); } } // Returns false if cancelled by progress_callback @@ -4069,12 +4715,20 @@ static bool llm_load_tensors( auto & hparams = model.hparams; +#ifdef GGML_USE_SYCL + // disable MoE with SYCL until mul_mat_id is updated + if (hparams.n_expert > 0) { + n_gpu_layers = 0; + } +#endif + model.split_mode = split_mode; model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; const int64_t n_layer = hparams.n_layer; const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); + bool use_mmap_buffer = true; // there is very little benefit to offloading the input layer, so always keep it on the CPU model.buft_input = llama_default_buffer_type_cpu(true); @@ -4163,6 +4817,10 @@ static bool llm_load_tensors( // create one context per buffer type size_t ctx_size = ggml_tensor_overhead()*(ml.n_tensors + 1); // +1 for models where tok_embd is duplicated as output + + // for moe merged tensors + ctx_size += ggml_tensor_overhead()*n_layer*3; + std::map ctx_map; for (auto & it : buft_layer_count) { struct ggml_init_params params = { @@ -4189,6 +4847,11 @@ static bool llm_load_tensors( const int64_t n_vocab = hparams.n_vocab; const int64_t n_vocab_type = hparams.n_vocab_type; const int64_t n_ff = hparams.n_ff; + const int64_t n_expert = hparams.n_expert; + + if (n_expert > 0 && hparams.n_expert_used == 0) { + throw std::runtime_error("model has expert layers but no expert layers are used"); + } GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); @@ -4243,28 +4906,148 @@ static bool llm_load_tensors( layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); - - if (layer.ffn_gate_inp == nullptr) { - GGML_ASSERT(hparams.n_expert == 0); - GGML_ASSERT(hparams.n_expert_used == 0); - + if (n_expert == 0) { layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { - GGML_ASSERT(hparams.n_expert > 0); - GGML_ASSERT(hparams.n_expert_used > 0); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); - // MoE branch - for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); - layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); - layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + if (layer.ffn_gate_exps) { + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } else { + // merge split expert into a single tensor for compatibility with older models + // requires disabling mmap + use_mmap_buffer = false; + + ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type; + ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type; + ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type; + + layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert); + layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert); + layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert); + + ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str()); + ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str()); + ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str()); + + for (uint32_t x = 0; x < n_expert; ++x) { + // the individual experts are loaded into a view of the merged tensor + ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x); + ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x); + ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x); + } } } } } break; + case LLM_ARCH_GROK: + { + if (n_expert == 0) { + throw std::runtime_error("Grok model cannot have zero experts"); + } + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}, false); + if (layer.ffn_gate_exps) { + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } else { + // merge split expert into a single tensor for compatibility with older models + // requires disabling mmap + use_mmap_buffer = false; + + ggml_type type_gate = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, 0).c_str())->type; + ggml_type type_down = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, 0).c_str())->type; + ggml_type type_up = ml.require_tensor_meta(tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, 0).c_str())->type; + + layer.ffn_gate_exps = ggml_new_tensor_3d(ctx_split, type_gate, n_embd, n_ff, n_expert); + layer.ffn_down_exps = ggml_new_tensor_3d(ctx_split, type_down, n_ff, n_embd, n_expert); + layer.ffn_up_exps = ggml_new_tensor_3d(ctx_split, type_up, n_embd, n_ff, n_expert); + + ggml_set_name(layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i).c_str()); + ggml_set_name(layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i).c_str()); + ggml_set_name(layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i).c_str()); + + for (uint32_t x = 0; x < n_expert; ++x) { + // the individual experts are loaded into a view of the merged tensor + ml.create_tensor_as_view(ctx_split, layer.ffn_gate_exps, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_gate_exps->nb[2]*x); + ml.create_tensor_as_view(ctx_split, layer.ffn_down_exps, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd }, layer.ffn_down_exps->nb[2]*x); + ml.create_tensor_as_view(ctx_split, layer.ffn_up_exps, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), { n_embd, n_ff }, layer.ffn_up_exps->nb[2]*x); + } + } + + layer.layer_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}); + } + } break; + case LLM_ARCH_DBRX: + { + if (n_expert == 0) { + throw std::runtime_error("DBRX model cannot have zero experts"); + } + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.attn_out_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), {n_embd, n_ff, n_expert}); + } + } break; case LLM_ARCH_BAICHUAN: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -4319,10 +5102,8 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); - layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); - } + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, false); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, false); layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); @@ -4502,6 +5283,7 @@ static bool llm_load_tensors( case LLM_ARCH_MPT: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, false); // output { @@ -4540,6 +5322,12 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, false); + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, false); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {n_embd}, false); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, false); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {n_embd}, false); + // AWQ ScaleActivation layer layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } @@ -4574,8 +5362,13 @@ static bool llm_load_tensors( layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); - layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + // optional q and k layernorms, present in StableLM 2 12B + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}, false); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}, false); + + // optional FFN norm, not present in StableLM 2 12B which uses parallel residual + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, false); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, false); layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); @@ -4618,7 +5411,13 @@ static bool llm_load_tensors( // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } for (int i = 0; i < n_layer; ++i) { @@ -4646,6 +5445,54 @@ static bool llm_load_tensors( layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; + case LLM_ARCH_QWEN2MOE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}); + + GGML_ASSERT(hparams.n_expert > 0); + GGML_ASSERT(hparams.n_expert_used > 0); + + // MoE branch + auto n_ff_exp = n_ff / hparams.n_expert_used; + layer.ffn_gate_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + layer.ffn_down_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff_exp, n_embd, n_expert}); + layer.ffn_up_exps = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff_exp, n_expert}); + + // Shared expert branch + layer.ffn_gate_inp_shexp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP_SHEXP, "weight", i), {n_embd}); + layer.ffn_gate_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_SHEXP, "weight", i), {n_embd, n_ff}); + layer.ffn_down_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_SHEXP, "weight", i), { n_ff, n_embd}); + layer.ffn_up_shexp = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_SHEXP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_PHI2: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -4691,6 +5538,33 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; + case LLM_ARCH_PHI3: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), { n_embd, n_vocab }); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), { n_embd }); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), { n_embd, n_vocab }); + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context* ctx_layer = ctx_for_layer(i); + ggml_context* ctx_split = ctx_for_layer_split(i); + + auto& layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), { n_embd }); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), { n_embd, n_embd + 2 * n_embd_gqa }, false); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), { n_embd }); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd }); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), { n_embd, 2 * n_ff }); + } + } break; case LLM_ARCH_PLAMO: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -4986,6 +5860,28 @@ static bool llm_load_tensors( layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); } } break; + case LLM_ARCH_XVERSE: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + auto & layer = model.layers[i]; + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; case LLM_ARCH_COMMAND_R: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -5007,11 +5903,47 @@ static bool llm_load_tensors( layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + if (n_layer >= 64){ + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head}); + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {hparams.n_embd_head_k, hparams.n_head_kv}); + } + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + } + } break; + case LLM_ARCH_OLMO: // adapted from LLM_ARCH_LLAMA with norm params removed + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); @@ -5024,56 +5956,97 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); + ml.init_mappings(true, use_mlock ? &model.mlock_mmaps : nullptr); + model.mappings.reserve(ml.mappings.size()); // create the backend buffers - std::vector> ctx_bufs; + std::vector> ctx_bufs; + ctx_bufs.reserve(ctx_map.size()); + + // Ensure we have enough capacity for the maximum backend buffer we will potentially create + size_t n_max_backend_buffer = ctx_map.size() * ml.files.size(); + model.bufs.reserve(n_max_backend_buffer); for (auto & it : ctx_map) { ggml_backend_buffer_type_t buft = it.first; - ggml_context * ctx = it.second; - ggml_backend_buffer_t buf = nullptr; + ggml_context * ctx = it.second; + + llama_buf_map bufs; + bufs.reserve(n_max_backend_buffer); // only the mmap region containing the tensors in the model is mapped to the backend buffer // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size - if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); -#ifdef GGML_USE_CUBLAS - if (n_layer >= n_gpu_layers) { - ggml_backend_cuda_register_host_buffer( + if (ml.use_mmap && use_mmap_buffer && buft == llama_default_buffer_type_cpu(true)) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + ggml_backend_buffer_t buf = ggml_backend_cpu_buffer_from_ptr((char *) addr + first, last - first); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend CPU buffer"); + } + model.bufs.push_back(buf); + bufs.emplace(idx, buf); +#ifdef GGML_USE_CUDA + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( ggml_backend_buffer_get_base(buf), ggml_backend_buffer_get_size(buf)); - } + } #endif + } } #ifdef GGML_USE_METAL - else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { - const size_t max_size = ggml_get_max_tensor_size(ctx); - size_t first, last; - ml.get_mapping_range(&first, &last, ctx); - buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); + else if (ml.use_mmap && use_mmap_buffer && buft == ggml_backend_metal_buffer_type()) { + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + const size_t max_size = ggml_get_max_tensor_size(ctx); + void * addr = nullptr; + size_t first, last; + ml.get_mapping_range(&first, &last, &addr, idx, ctx); + if (first >= last) { + continue; + } + ggml_backend_buffer_t buf = ggml_backend_metal_buffer_from_ptr((char *) addr + first, last - first, max_size); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend metal buffer"); + } + model.bufs.push_back(buf); + bufs.emplace(idx, buf); + } } #endif else { - buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf == nullptr) { + throw std::runtime_error("unable to allocate backend buffer"); + } + model.bufs.push_back(buf); + if (use_mlock && ggml_backend_buffer_is_host(buf)) { model.mlock_bufs.emplace_back(new llama_mlock); auto & mlock_buf = model.mlock_bufs.back(); mlock_buf->init (ggml_backend_buffer_get_base(buf)); mlock_buf->grow_to(ggml_backend_buffer_get_size(buf)); } + for (uint32_t idx = 0; idx < ml.files.size(); idx++) { + bufs.emplace(idx, buf); + } } - if (buf == nullptr) { + + if (bufs.empty()) { throw std::runtime_error("failed to allocate buffer"); } - // indicate that this buffer contains weights - // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight - ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); - model.bufs.push_back(buf); - ctx_bufs.emplace_back(ctx, buf); + + for (auto & buf : bufs) { + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + } + + ctx_bufs.emplace_back(ctx, bufs); } if (llama_supports_gpu_offload()) { @@ -5105,13 +6078,17 @@ static bool llm_load_tensors( // load tensor data for (auto & it : ctx_bufs) { ggml_context * ctx = it.first; - ggml_backend_buffer_t buf = it.second; - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + auto & bufs = it.second; + if (!ml.load_all_data(ctx, bufs, use_mlock ? &model.mlock_mmaps : NULL, progress_callback, progress_callback_user_data)) { return false; } } - model.mapping = std::move(ml.mapping); + if (use_mmap_buffer) { + for (auto & mapping : ml.mappings) { + model.mappings.emplace_back(std::move(mapping)); + } + } // loading time will be recalculate after the first eval, so // we take page faults deferred by mmap() into consideration @@ -5122,7 +6099,7 @@ static bool llm_load_tensors( // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) { try { - llama_model_loader ml(fname, params.use_mmap, params.kv_overrides); + llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides); model.hparams.vocab_only = params.vocab_only; @@ -5160,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam || !( model.ftype == LLAMA_FTYPE_ALL_F32 || model.ftype == LLAMA_FTYPE_MOSTLY_F16 || + model.ftype == LLAMA_FTYPE_MOSTLY_BF16 || model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ) @@ -5251,37 +6229,47 @@ static struct ggml_tensor * llm_build_inp_embd( static void llm_build_kv_store( struct ggml_context * ctx, const llama_hparams & hparams, + const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * k_cur, struct ggml_tensor * v_cur, - int64_t n_ctx, int32_t n_tokens, int32_t kv_head, const llm_build_cb & cb, int64_t il) { + const int64_t n_ctx = cparams.n_ctx; + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(kv.size == n_ctx); - // compute the transposed [n_tokens, n_embd] V matrix - struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)); - //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed - cb(v_cur_t, "v_cur_t", il); - struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_k_gqa, (ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa))*kv_head); cb(k_cache_view, "k_cache_view", il); - struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv.v_l[il]), - (kv_head)*ggml_element_size(kv.v_l[il])); + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); + + assert(v_cur->ne[0] == n_embd_v_gqa && v_cur->ne[1] == n_tokens); + + struct ggml_tensor * v_cache_view = nullptr; + + if (cparams.flash_attn) { + v_cache_view = ggml_view_1d(ctx, kv.v_l[il], n_tokens*n_embd_v_gqa, + (kv_head)*ggml_row_size(kv.v_l[il]->type, n_embd_v_gqa)); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv.v_l[il]), + (kv_head)*ggml_element_size(kv.v_l[il])); + + v_cur = ggml_transpose(ctx, v_cur); + } cb(v_cache_view, "v_cache_view", il); - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view)); - ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur_t, v_cache_view)); + ggml_build_forward_expand(graph, ggml_cpy(ctx, v_cur, v_cache_view)); } static struct ggml_tensor * llm_build_norm( @@ -5407,11 +6395,105 @@ static struct ggml_tensor * llm_build_ffn( return cur; } -// if max_alibi_bias > 0 then apply ALiBi +static struct ggml_tensor * llm_build_moe_ffn( + struct ggml_context * ctx, + struct ggml_tensor * cur, + struct ggml_tensor * gate_inp, + struct ggml_tensor * up_exps, + struct ggml_tensor * gate_exps, + struct ggml_tensor * down_exps, + int64_t n_expert, + int64_t n_expert_used, + llm_ffn_op_type type_op, + bool norm_w, + const llm_build_cb & cb, + int il) { + int64_t n_embd = cur->ne[0]; + int64_t n_tokens = cur->ne[1]; + + ggml_tensor * logits = ggml_mul_mat(ctx, gate_inp, cur); // [n_expert, n_tokens] + cb(logits, "ffn_moe_logits", il); + + ggml_tensor * probs = ggml_soft_max(ctx, logits); // [n_expert, n_tokens] + cb(probs, "ffn_moe_probs", il); + + // select experts + ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_expert_used); // [n_expert_used, n_tokens] + cb(selected_experts->src[0], "ffn_moe_argsort", il); + cb(selected_experts, "ffn_moe_topk", il); + + ggml_tensor * weights = ggml_get_rows(ctx, + ggml_reshape_3d(ctx, probs, 1, n_expert, n_tokens), selected_experts); // [1, n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights", il); + + if (norm_w) { + weights = ggml_reshape_2d(ctx, weights, n_expert_used, n_tokens); + + ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); // [1, n_tokens] + cb(weights_sum, "ffn_moe_weights_sum", il); + + weights = ggml_div(ctx, weights, weights_sum); // [n_expert_used, n_tokens] + cb(weights, "ffn_moe_weights_norm", il); + + weights = ggml_reshape_3d(ctx, weights, 1, n_expert_used, n_tokens); + } + + cur = ggml_reshape_3d(ctx, cur, n_embd, 1, n_tokens); + ggml_tensor * up = ggml_mul_mat_id(ctx, up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(up, "ffn_moe_up", il); + + ggml_tensor * gate = ggml_mul_mat_id(ctx, gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(gate, "ffn_moe_gate", il); + + switch (type_op) { + case LLM_FFN_SILU: + { + gate = ggml_silu(ctx, gate); + cb(gate, "ffn_moe_silu", il); + } break; + case LLM_FFN_GELU: + { + gate = ggml_gelu(ctx, gate); + cb(gate, "ffn_moe_gelu", il); + } break; + default: + GGML_ASSERT(false); + } + + ggml_tensor * par = ggml_mul(ctx, up, gate); // [n_ff, n_expert_used, n_tokens] + cb(par, "ffn_moe_gate_par", il); + + ggml_tensor * experts = ggml_mul_mat_id(ctx, down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + cb(experts, "ffn_moe_down", il); + + experts = ggml_mul(ctx, experts, weights); + + // aggregate experts + ggml_tensor * moe_out = nullptr; + for (int i = 0; i < n_expert_used; ++i) { + ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens, + experts->nb[2], i*experts->nb[1]); + + if (i == 0) { + moe_out = cur_expert; + } else { + moe_out = ggml_add(ctx, moe_out, cur_expert); + } + } + + if (n_expert_used == 1) { + // avoid returning a non-contiguous tensor + moe_out = ggml_cont(ctx, moe_out); + } + + return moe_out; +} + static struct ggml_tensor * llm_build_kqv( struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, + const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -5419,12 +6501,12 @@ static struct ggml_tensor * llm_build_kqv( struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, struct ggml_tensor * kq_pos, - int64_t n_ctx, int32_t n_tokens, int32_t n_kv, float kq_scale, const llm_build_cb & cb, int il) { + const int64_t n_ctx = cparams.n_ctx; const int64_t n_head = hparams.n_head; const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head_k = hparams.n_embd_head_k; @@ -5442,58 +6524,100 @@ static struct ggml_tensor * llm_build_kqv( 0); cb(k, "k", il); - struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); - cb(kq, "kq", il); + struct ggml_tensor * cur; - if (model.arch == LLM_ARCH_PHI2) { - // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs - // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 - ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - } + if (cparams.flash_attn) { + GGML_UNUSED(model); + GGML_UNUSED(n_ctx); + + // note: if this assert triggers, then some check has failed earlier + // the idea is to detect during context creation that ALiBi would be used and disable Flash Attention + GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention"); + + // split cached v into n_head heads (not transposed) + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv.v_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv.v_l[il]->type, n_embd_head_k), + 0); + cb(v, "v", il); + + cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale); + + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); + } + + cur = ggml_reshape_2d(ctx, cur, n_embd_head_k*n_head, n_tokens); + } else { + struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); + cb(kq, "kq", il); + + if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { + // for this arch, we need to perform the KQ multiplication with F32 precision, otherwise we get NaNs + // ref: https://github.com/ggerganov/llama.cpp/pull/4490#issuecomment-1859055847 + ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + } + + if (model.arch == LLM_ARCH_GROK) { + // need to do the following: + // multiply by attn_output_multiplyer of 0.08838834764831845 + // and then : + // kq = 30 * tanh(kq / 30) + // before the softmax below + + //try from phi2 + //ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + + kq = ggml_tanh(ctx, ggml_scale(ctx, kq, 0.08838834764831845f/30.0f)); + kq = ggml_scale(ctx, kq, 30); + } #if defined(GGML_USE_KOMPUTE) #pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute") #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488") - if (hparams.f_max_alibi_bias > 0.0f) { - kq = ggml_scale(ctx, kq, kq_scale); - cb(kq, "kq_scaled", il); + if (hparams.use_alibi) { + kq = ggml_scale(ctx, kq, kq_scale); + cb(kq, "kq_scaled", il); - kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); - cb(kq, "kq_scaled_alibi", il); + kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias); + cb(kq, "kq_scaled_alibi", il); - kq = ggml_add(ctx, kq, kq_mask); - cb(kq, "kq_masked", il); + kq = ggml_add(ctx, kq, kq_mask); + cb(kq, "kq_masked", il); - kq = ggml_soft_max(ctx, kq); - cb(kq, "kq_soft_max", il); - } else + kq = ggml_soft_max(ctx, kq); + cb(kq, "kq_soft_max", il); + } else #endif - { - kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); - cb(kq, "kq_soft_max_ext", il); + { + kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias); + cb(kq, "kq_soft_max_ext", il); + } + + GGML_ASSERT(kv.size == n_ctx); + + // split cached v into n_head heads + struct ggml_tensor * v = + ggml_view_3d(ctx, kv.v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv.v_l[il])*n_ctx, + ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, + 0); + cb(v, "v", il); + + struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); + cb(kqv, "kqv", il); + + struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); + cb(kqv_merged, "kqv_merged", il); + + cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); + cb(cur, "kqv_merged_cont", il); } - GGML_ASSERT(kv.size == n_ctx); - - // split cached v into n_head heads - struct ggml_tensor * v = - ggml_view_3d(ctx, kv.v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv.v_l[il])*n_ctx, - ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head_v, - 0); - cb(v, "v", il); - - struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq); - cb(kqv, "kqv", il); - - struct ggml_tensor * kqv_merged = ggml_permute(ctx, kqv, 0, 2, 1, 3); - cb(kqv_merged, "kqv_merged", il); - - struct ggml_tensor * cur = ggml_cont_2d(ctx, kqv_merged, n_embd_head_k*n_head, n_tokens); - cb(cur, "kqv_merged_cont", il); - ggml_build_forward_expand(graph, cur); cur = ggml_mul_mat(ctx, wo, cur); @@ -5512,6 +6636,7 @@ static struct ggml_tensor * llm_build_kv( struct ggml_context * ctx, const llama_model & model, const llama_hparams & hparams, + const llama_cparams & cparams, const llama_kv_cache & kv, struct ggml_cgraph * graph, struct ggml_tensor * wo, @@ -5521,7 +6646,6 @@ static struct ggml_tensor * llm_build_kv( struct ggml_tensor * q_cur, struct ggml_tensor * kq_mask, struct ggml_tensor * kq_pos, - int64_t n_ctx, int32_t n_tokens, int32_t kv_head, int32_t n_kv, @@ -5535,12 +6659,12 @@ static struct ggml_tensor * llm_build_kv( ggml_build_forward_expand(graph, k_cur); ggml_build_forward_expand(graph, v_cur); - llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il); + llm_build_kv_store(ctx, hparams, cparams, kv, graph, k_cur, v_cur, n_tokens, kv_head, cb, il); struct ggml_tensor * cur; - cur = llm_build_kqv(ctx, model, hparams, kv, graph, wo, wo_b, - q_cur, kq_mask, kq_pos, n_ctx, n_tokens, n_kv, kq_scale, cb, il); + cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, + q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il); cb(cur, "kqv_out", il); return cur; @@ -5577,10 +6701,13 @@ struct llm_build_context { const float norm_rms_eps; const int32_t n_tokens; - const int32_t n_kv; // size of KV cache to consider (n_kv <= n_ctx) + const int32_t n_kv; // size of KV cache to consider (n_kv <= kv_self.size) + const int32_t n_outputs; const int32_t kv_head; // index of where we store new KV data in the cache const int32_t n_orig_ctx; + const bool flash_attn; + const enum llama_pooling_type pooling_type; const enum llama_rope_type rope_type; @@ -5624,8 +6751,10 @@ struct llm_build_context { norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), + n_outputs (worst_case ? n_tokens : lctx.n_outputs), kv_head (worst_case ? (kv_self.recurrent ? 0 : kv_self.size - n_tokens) : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), + flash_attn (cparams.flash_attn), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), cb (cb), @@ -5645,6 +6774,7 @@ struct llm_build_context { lctx.inp_tokens = nullptr; lctx.inp_embd = nullptr; lctx.inp_pos = nullptr; + lctx.inp_out_ids = nullptr; lctx.inp_KQ_mask = nullptr; lctx.inp_KQ_pos = nullptr; lctx.inp_K_shift = nullptr; @@ -5739,15 +6869,31 @@ struct llm_build_context { ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; - ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); + if (flash_attn) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(kv_self.v_l[il]->type, kv_self.size), + ggml_row_size(kv_self.v_l[il]->type, id)); + } ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); @@ -5768,22 +6914,35 @@ struct llm_build_context { return lctx.inp_pos; } + struct ggml_tensor * build_inp_out_ids() { + lctx.inp_out_ids = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_outputs); + cb(lctx.inp_out_ids, "inp_out_ids", -1); + ggml_set_input(lctx.inp_out_ids); + return lctx.inp_out_ids; + } + struct ggml_tensor * build_inp_KQ_mask(bool causal = true) { if (causal) { - lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, n_tokens); + lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); } else { - lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); + lctx.inp_KQ_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD)); } cb(lctx.inp_KQ_mask, "KQ_mask", -1); ggml_set_input(lctx.inp_KQ_mask); - return lctx.inp_KQ_mask; + return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; } - struct ggml_tensor * build_inp_KQ_pos() { - lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv); + struct ggml_tensor * build_inp_KQ_pos(bool causal = true) { + if (causal) { + lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv); + } else { + // TODO: this will be needed for ALiBi-based BERT models + // https://github.com/ggerganov/llama.cpp/pull/6826 + lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_tokens); + } cb(lctx.inp_KQ_pos, "KQ_pos", -1); ggml_set_input(lctx.inp_KQ_pos); - return lctx.inp_KQ_pos; + return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos; } struct ggml_tensor * build_inp_mean() { @@ -5824,6 +6983,9 @@ struct llm_build_context { struct ggml_cgraph * build_llama() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -5886,9 +7048,17 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -5915,62 +7085,15 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "ffn_norm", il); - ggml_tensor * logits = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp, cur); // [n_tokens, num_experts] - cb(logits, "ffn_moe_logits", il); - - ggml_tensor * probs = ggml_soft_max(ctx0, logits); // [n_tokens, num_experts] - cb(probs, "ffn_moe_probs", il); - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx0, probs, n_expert_used); // [n_tokens, num_experts_per_tok] - cb(selected_experts->src[0], "ffn_moe_argsort", il); - - ggml_tensor * weights = ggml_get_rows(ctx0, - ggml_reshape_3d(ctx0, probs, 1, n_expert, n_tokens), selected_experts); - cb(weights, "ffn_moe_weights", il); - - weights = ggml_reshape_2d(ctx0, weights, n_expert_used, n_tokens); // [n_tokens, num_experts_per_tok] - - ggml_tensor * weights_sum = ggml_sum_rows(ctx0, weights); - cb(weights_sum, "ffn_moe_weights_sum", il); - - weights = ggml_div(ctx0, weights, weights_sum); // [n_tokens, num_experts_per_tok] - cb(weights, "ffn_moe_weights_norm", il); - - // compute expert outputs - ggml_tensor * moe_out = nullptr; - - for (int i = 0; i < n_expert_used; ++i) { - ggml_tensor * cur_expert; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx0, model.layers[il].ffn_up_exp, n_expert, selected_experts, i, cur); - cb(cur_up, "ffn_moe_up", il); - - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx0, model.layers[il].ffn_gate_exp, n_expert, selected_experts, i, cur); - cb(cur_gate, "ffn_moe_gate", il); - - cur_gate = ggml_silu(ctx0, cur_gate); - cb(cur_gate, "ffn_moe_silu", il); - - cur_expert = ggml_mul(ctx0, cur_up, cur_gate); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_gate_par", il); - - cur_expert = ggml_mul_mat_id(ctx0, model.layers[il].ffn_down_exp, n_expert, selected_experts, i, cur_expert); // [n_tokens, n_embd] - cb(cur_expert, "ffn_moe_down", il); - - cur_expert = ggml_mul(ctx0, cur_expert, - ggml_view_2d(ctx0, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); - cb(cur_expert, "ffn_moe_weighted", il); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx0, moe_out, cur_expert); - cb(moe_out, "ffn_moe_out", il); - } - } - - cur = moe_out; + cur = llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + cb, il); + cb(cur, "ffn_moe_out", il); } cur = ggml_add(ctx0, cur, ffn_inp); @@ -6065,9 +7188,16 @@ struct llm_build_context { cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -6112,6 +7242,111 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_xverse() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + // positions of the tokens in the KV cache + struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_falcon() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6180,9 +7415,17 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm = ggml_get_rows(ctx0, attn_norm, inp_out_ids); } struct ggml_tensor * ffn_inp = cur; @@ -6225,6 +7468,293 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_grok() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // multiply by embedding_multiplier_scale of 78.38367176906169 + inpL = ggml_scale(ctx0, inpL, 78.38367176906169f); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // Grok + // if attn_out_norm is present then apply it before adding the input + if (model.layers[il].attn_out_norm) { + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].attn_out_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_out_norm", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_GELU, true, + cb, il); + cb(cur, "ffn_moe_out", il); + + // Grok + // if layer_out_norm is present then apply it before adding the input + // Idea: maybe ffn_out_norm is a better name + if (model.layers[il].layer_out_norm) { + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].layer_out_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "layer_out_norm", il); + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + // Grok + // multiply logits by output_multiplier_scale of 0.5773502691896257 + + cur = ggml_scale(ctx0, cur, 0.5773502691896257f); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + + struct ggml_cgraph * build_dbrx() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + + cur = ggml_clamp(ctx0, cur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(cur, "wqkv_clamped", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, NULL, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].attn_out_norm, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_out_norm", il); + + cur = llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, true, + cb, il); + cb(cur, "ffn_moe_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_starcoder() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -6274,9 +7804,16 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input @@ -6471,9 +8008,16 @@ struct llm_build_context { ); cb(Vcur, "Vcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Q, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, residual, cur); @@ -6560,9 +8104,16 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -6722,6 +8273,13 @@ struct llm_build_context { } cb(cur, "kqv_out", il); + if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // re-add the layer input cur = ggml_add(ctx0, cur, inpL); @@ -6839,9 +8397,16 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // Add the input @@ -6891,6 +8456,7 @@ struct llm_build_context { GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); struct ggml_tensor * cur; + struct ggml_tensor * pos; struct ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); @@ -6901,6 +8467,16 @@ struct llm_build_context { // positions of the tokens in the KV cache struct ggml_tensor * KQ_pos = build_inp_KQ_pos(); + if (model.pos_embd) { + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + pos = ggml_get_rows(ctx0, model.pos_embd, inp_pos); + cb(pos, "pos_embd", -1); + + inpL = ggml_add(ctx0, inpL, pos); + cb(inpL, "inpL", -1); + } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * attn_norm; @@ -6935,11 +8511,40 @@ struct llm_build_context { cb(Kcur, "Kcur", il); cb(Vcur, "Vcur", il); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + // Q/K Layernorm + if (model.layers[il].attn_q_norm) { + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + model.layers[il].attn_q_norm_b, + LLM_NORM, cb, il); + cb(Qcur, "Qcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, - model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + model.layers[il].attn_k_norm_b, + LLM_NORM, cb, il); + cb(Kcur, "Kcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } else { + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // Add the input @@ -7003,7 +8608,7 @@ struct llm_build_context { struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; + // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -7012,6 +8617,8 @@ struct llm_build_context { LLM_NORM, cb, il); cb(cur, "attn_norm", il); + struct ggml_tensor * inpSA = cur; + // self-attention { // compute Q and K and RoPE them @@ -7036,36 +8643,69 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + cb(Qcur, "Qcur", il); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + cb(Kcur, "Kcur", il); + + if (model.layers[il].attn_q_norm) { + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, cb, il); + cb(Qcur, "Qcur", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, cb, il); + cb(Kcur, "Kcur", il); + } + + Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } - struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpL); cb(ffn_inp, "ffn_inp", il); // feed-forward network { - cur = llm_build_norm(ctx0, ffn_inp, hparams, - model.layers[il].ffn_norm, - model.layers[il].ffn_norm_b, - LLM_NORM, cb, il); - cb(cur, "ffn_norm", il); - + if (model.layers[il].ffn_norm) { + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, + model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + } else { + // parallel residual + cur = inpSA; + } cur = llm_build_ffn(ctx0, cur, model.layers[il].ffn_up, NULL, model.layers[il].ffn_gate, NULL, @@ -7156,9 +8796,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7248,12 +8895,6 @@ struct llm_build_context { Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); cb(Vcur, "Vcur", il); - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, Qcur); - ggml_build_forward_expand(gf, Kcur); - ggml_build_forward_expand(gf, Vcur); - Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, @@ -7268,9 +8909,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7313,6 +8961,150 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_qwen2moe() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "attn_norm", il); + + // self_attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // MoE branch + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + ggml_tensor * moe_out = + llm_build_moe_ffn(ctx0, cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + n_expert, n_expert_used, + LLM_FFN_SILU, false, + cb, il); + cb(cur, "ffn_moe_out", il); + + // FFN shared expert + { + ggml_tensor * cur_gate_inp = ggml_mul_mat(ctx0, model.layers[il].ffn_gate_inp_shexp, cur); + cb(cur_gate_inp, "ffn_shexp_gate_inp", il); + + // sigmoid + ggml_tensor * cur_gate = ggml_div(ctx0, ggml_silu(ctx0, cur_gate_inp), cur_gate_inp); + cb(cur_gate, "ffn_shexp_gate", il); + + ggml_tensor * cur_ffn = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up_shexp, NULL, + model.layers[il].ffn_gate_shexp, NULL, + model.layers[il].ffn_down_shexp, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur_ffn, "ffn_shexp", il); + + ggml_tensor * ffn_shexp_out = ggml_mul(ctx0, cur_ffn, cur_gate); + cb(ffn_shexp_out, "ffn_shexp_out", il); + + moe_out = ggml_add(ctx0, moe_out, ffn_shexp_out); + cb(moe_out, "ffn_out", il); + + cur = moe_out; + } + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } + struct ggml_cgraph * build_phi2() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -7386,9 +9178,17 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + attn_norm_output = ggml_get_rows(ctx0, attn_norm_output, inp_out_ids); } // FF @@ -7422,12 +9222,140 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, model.output_b); cb(cur, "result_output", -1); + ggml_build_forward_expand(gf, cur); + return gf; + } + + struct ggml_cgraph * build_phi3() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + auto residual = inpL; + + // self-attention + { + struct ggml_tensor* attn_norm_output = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, + NULL, + LLM_NORM_RMS, cb, il); + cb(attn_norm_output, "attn_norm", il); + + struct ggml_tensor * Qcur = nullptr; + struct ggml_tensor * Kcur = nullptr; + struct ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv) { + cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output); + cb(cur, "wqkv", il); + + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0 * sizeof(float) * (n_embd))); + Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd))); + Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa))); + } + else { + Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq); + Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk); + Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv); + } + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_rope_custom( + ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Qcur = ggml_scale(ctx0, Qcur, 1.0f / sqrtf(float(n_embd_head))); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx, + freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor* inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + residual = ggml_get_rows(ctx0, residual, inp_out_ids); + } + + cur = ggml_add(ctx0, cur, residual); + residual = cur; + + cur = llm_build_norm(ctx0, cur, hparams, + model.layers[il].ffn_norm, NULL, + LLM_NORM_RMS, cb, il); + cb(cur, "ffn_norm", il); + + // FF + // special-case: the up and gate tensors are merged into a single tensor + // TOOD: support into llm_build_ffn + { + struct ggml_tensor* up = ggml_mul_mat(ctx0, model.layers[il].ffn_up, cur); + cb(up, "ffn_up", il); + + auto g = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), 0)); + auto y = ggml_cont(ctx0, ggml_view_2d(ctx0, up, up->ne[0] / 2, up->ne[1], ggml_row_size(up->type, up->ne[0]), up->nb[1] / 2)); + + y = ggml_mul(ctx0, y, ggml_silu(ctx0, g)); + cb(y, "ffn_gate", il); + + auto down = ggml_mul_mat(ctx0, model.layers[il].ffn_down, y); + cb(down, "ffn_down", il); + + cur = down; + cb(cur, "ffn_out", il); + } + + cur = ggml_add(ctx0, residual, cur); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = llm_build_norm(ctx0, inpL, hparams, + model.output_norm, + NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); ggml_build_forward_expand(gf, cur); return gf; } + struct ggml_cgraph * build_plamo() { struct ggml_cgraph * gf = ggml_new_graph(ctx0); @@ -7480,14 +9408,22 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); } struct ggml_tensor * sa_out = cur; cur = attention_norm; + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + sa_out = ggml_get_rows(ctx0, sa_out, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // feed-forward network { cur = llm_build_ffn(ctx0, cur, @@ -7575,9 +9511,16 @@ struct llm_build_context { Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input @@ -7675,9 +9618,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } // add the input @@ -7784,9 +9734,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -7894,9 +9851,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -8017,9 +9981,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } // scale_res - scale the hidden states for residual connection @@ -8131,9 +10102,16 @@ struct llm_build_context { ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, NULL, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f, cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); } struct ggml_tensor * sa_out = ggml_add(ctx0, cur, inpL); @@ -8243,9 +10221,16 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -8395,6 +10380,15 @@ struct llm_build_context { struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + x = ggml_get_rows(ctx0, x, inp_out_ids); + y = ggml_get_rows(ctx0, y, inp_out_ids); + z = ggml_get_rows(ctx0, z, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + } + // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); @@ -8478,6 +10472,31 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + if (model.layers[il].attn_q_norm) { + Qcur = ggml_view_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens, + ggml_element_size(Qcur) * n_embd_head, + ggml_element_size(Qcur) * n_embd_head * n_head, + 0); + cb(Qcur, "Qcur", il); + Kcur = ggml_view_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens, + ggml_element_size(Kcur) * n_embd_head, + ggml_element_size(Kcur) * n_embd_head * n_head_kv, + 0); + cb(Kcur, "Kcur", il); + + Qcur = llm_build_norm(ctx0, Qcur, hparams, + model.layers[il].attn_q_norm, + NULL, + LLM_NORM, cb, il); + cb(Qcur, "Qcur", il); + + Kcur = llm_build_norm(ctx0, Kcur, hparams, + model.layers[il].attn_k_norm, + NULL, + LLM_NORM, cb, il); + cb(Kcur, "Kcur", il); + } + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, @@ -8492,9 +10511,17 @@ struct llm_build_context { ); cb(Kcur, "Kcur", il); - cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, - Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpL = ggml_get_rows(ctx0, inpL, inp_out_ids); + ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids); } struct ggml_tensor * attn_out = cur; @@ -8540,6 +10567,139 @@ struct llm_build_context { return gf; } + + // ref: https://allenai.org/olmo + // based on the original build_llama() function, changes: + // * non-parametric layer norm + // * clamp qkv + // * removed bias + // * removed MoE + struct ggml_cgraph * build_olmo() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // mutable variable, needed during the last layer of the computation to skip unused tokens + int32_t n_tokens = this->n_tokens; + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = build_inp_pos(); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + NULL, NULL, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Qcur = ggml_clamp(ctx0, Qcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Kcur = ggml_clamp(ctx0, Kcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (hparams.f_clamp_kqv > 0.0f) { + Vcur = ggml_clamp(ctx0, Vcur, -hparams.f_clamp_kqv, hparams.f_clamp_kqv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, + model.layers[il].wo, nullptr, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + } + + if (il == n_layer - 1) { + // skip computing output for unused tokens + struct ggml_tensor * inp_out_ids = build_inp_out_ids(); + n_tokens = n_outputs; + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + cur = llm_build_norm(ctx0, ffn_inp, hparams, + NULL, NULL, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, NULL, + model.layers[il].ffn_gate, NULL, + model.layers[il].ffn_down, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, cb, il); + cb(cur, "ffn_out", il); + + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "ffn_out", il); + + ggml_tensor * layer_dir = lctx.cvec.tensor_for(il); + if (layer_dir != nullptr) { + cur = ggml_add(ctx0, cur, layer_dir); + } + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + NULL, NULL, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -8648,6 +10808,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_falcon(); } break; + case LLM_ARCH_GROK: + { + result = llm.build_grok(); + } break; case LLM_ARCH_STARCODER: { result = llm.build_starcoder(); @@ -8685,10 +10849,18 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_qwen2(); } break; + case LLM_ARCH_QWEN2MOE: + { + result = llm.build_qwen2moe(); + } break; case LLM_ARCH_PHI2: { result = llm.build_phi2(); } break; + case LLM_ARCH_PHI3: + { + result = llm.build_phi3(); + } break; case LLM_ARCH_PLAMO: { result = llm.build_plamo(); @@ -8725,10 +10897,22 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_mamba(); } break; + case LLM_ARCH_XVERSE: + { + result = llm.build_xverse(); + } break; case LLM_ARCH_COMMAND_R: { result = llm.build_command_r(); } break; + case LLM_ARCH_DBRX: + { + result = llm.build_dbrx(); + } break; + case LLM_ARCH_OLMO: + { + result = llm.build_olmo(); + } break; default: GGML_ASSERT(false); } @@ -8790,9 +10974,39 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { ggml_backend_tensor_set(lctx.inp_pos, batch.pos, 0, n_tokens*ggml_element_size(lctx.inp_pos)); } + if (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE) { + GGML_ASSERT(lctx.inp_out_ids && "every model that can must skip unused outputs"); + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_out_ids->buffer)); + int32_t * data = (int32_t *) lctx.inp_out_ids->data; + + if (lctx.n_outputs == n_tokens) { + for (int i = 0; i < n_tokens; ++i) { + data[i] = i; + } + } else if (batch.logits) { + int32_t n_outputs = 0; + for (int i = 0; i < n_tokens; ++i) { + if (batch.logits[i]) { + data[n_outputs++] = i; + } + } + // the graph needs to have been passed the correct number of outputs + GGML_ASSERT(lctx.n_outputs == n_outputs); + } else if (lctx.n_outputs == 1) { + // only keep last output + data[0] = n_tokens - 1; + } else { + GGML_ASSERT(lctx.n_outputs == 0); + } + } + GGML_ASSERT( + // (!a || b) is a logical implication (a -> b) + // !hparams.causal_attn -> !cparams.causal_attn (hparams.causal_attn || !cparams.causal_attn) && - "non-causal attention with generative models is not supported" + "causal attention with embedding models is not supported" ); if (lctx.inp_KQ_mask) { @@ -8857,7 +11071,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (hparams.need_kq_pos) { + // ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch + // this allows to process multiple sequences in parallel with ALiBi-based models + if (hparams.use_alibi) { const int64_t n_kv = kv_self.n; GGML_ASSERT(lctx.inp_KQ_pos); @@ -8971,6 +11187,74 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } +// Make sure enough space is available for outputs. +// Returns max number of outputs for which space was reserved. +static size_t llama_output_reserve(llama_context & lctx, size_t n_outputs) { + const auto & cparams = lctx.cparams; + const auto & hparams = lctx.model.hparams; + + const size_t n_outputs_max = std::max(n_outputs, (size_t) cparams.n_seq_max); + + const auto n_batch = cparams.n_batch; + const auto n_vocab = hparams.n_vocab; + const auto n_embd = hparams.n_embd; + + // TODO: use a per-batch flag for logits presence instead + const bool has_logits = cparams.causal_attn; + const bool has_embd = cparams.embeddings && (hparams.causal_attn || cparams.pooling_type == LLAMA_POOLING_TYPE_NONE); + + const size_t logits_size = has_logits ? n_vocab*n_outputs_max : 0; + const size_t embd_size = has_embd ? n_embd*n_outputs_max : 0; + + if (lctx.output_ids.empty()) { + // init, never resized afterwards + lctx.output_ids.resize(n_batch); + } + + const size_t prev_size = lctx.buf_output ? ggml_backend_buffer_get_size(lctx.buf_output) : 0; + const size_t new_size = (logits_size + embd_size) * sizeof(float); + + // alloc only when more than the current capacity is required + // TODO: also consider shrinking the buffer + if (!lctx.buf_output || prev_size < new_size) { + if (lctx.buf_output) { +#ifndef NDEBUG + // This doesn't happen often, but may be annoying in some cases (like the HellaSwag benchmark) + LLAMA_LOG_INFO("%s: reallocating output buffer from size %.02f MiB to %.02f MiB\n", __func__, prev_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + ggml_backend_buffer_free(lctx.buf_output); + lctx.buf_output = nullptr; + lctx.logits = nullptr; + lctx.embd = nullptr; + } + + lctx.buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), new_size); + if (lctx.buf_output == nullptr) { + LLAMA_LOG_ERROR("%s: failed to allocate output buffer of size %.2f MiB\n", __func__, new_size / (1024.0 * 1024.0)); + return 0; + } + } + + float * output_base = (float *) ggml_backend_buffer_get_base(lctx.buf_output); + + lctx.logits = has_logits ? output_base : nullptr; + lctx.embd = has_embd ? output_base + logits_size : nullptr; + + lctx.output_size = n_outputs_max; + lctx.logits_size = logits_size; + lctx.embd_size = embd_size; + + // set all ids as invalid (negative) + std::fill(lctx.output_ids.begin(), lctx.output_ids.end(), -1); + + ggml_backend_buffer_clear(lctx.buf_output, 0); + + lctx.n_outputs = 0; + + return n_outputs_max; +} + + static void llama_graph_compute( llama_context & lctx, ggml_cgraph * gf, @@ -9046,16 +11330,8 @@ static int llama_decode_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; - - auto * logits_out = lctx.logits; - -#ifndef NDEBUG - auto & logits_valid = lctx.logits_valid; - logits_valid.clear(); - logits_valid.resize(n_tokens_all); - - memset(logits_out, 0, lctx.logits_size*sizeof(float)); -#endif + uint32_t n_outputs = 0; + uint32_t n_outputs_prev = 0; const auto n_ubatch = cparams.n_ubatch; @@ -9064,6 +11340,38 @@ static int llama_decode_internal( std::vector seq_id_arr; std::vector> seq_id; + // count outputs + if (batch_all.logits) { + for (uint32_t i = 0; i < n_tokens_all; ++i) { + n_outputs += batch_all.logits[i] != 0; + } + } else if (lctx.logits_all || (cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE)) { + n_outputs = n_tokens_all; + } else { + // keep last output only + n_outputs = 1; + } + + // reserve output buffer + if (llama_output_reserve(lctx, n_outputs) < n_outputs) { + LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs); + return -2; + }; + + // set output mappings + if (batch_all.logits) { + int32_t i_logits = 0; + for (uint32_t i = 0; i < n_tokens_all; ++i) { + if (batch_all.logits[i]) { + lctx.output_ids[i] = i_logits++; + } + } + } else { + for (uint32_t i = 0; i < n_outputs; ++i) { + lctx.output_ids[i] = i; + } + } + for (uint32_t cur_token = 0; cur_token < n_tokens_all; cur_token += n_ubatch) { const uint32_t n_tokens = std::min(n_ubatch, n_tokens_all - cur_token); llama_batch u_batch = { @@ -9079,6 +11387,27 @@ static int llama_decode_internal( /* .all_seq_id = */ batch_all.all_seq_id, }; + // count the outputs in this u_batch + { + int32_t n_outputs_new = 0; + + if (u_batch.logits) { + for (uint32_t i = 0; i < n_tokens; i++) { + n_outputs_new += u_batch.logits[i] != 0; + } + } else if (n_outputs == n_tokens_all) { + n_outputs_new = n_tokens; + } else { + // keep last output only + if (cur_token + n_tokens >= n_tokens_all) { + n_outputs_new = 1; + } + } + + // needs to happen before the graph is built + lctx.n_outputs = n_outputs_new; + } + int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch; GGML_ASSERT(n_threads > 0); @@ -9126,7 +11455,7 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + kv_self.n = std::min(kv_self.size, std::max(256u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 256))); //kv_self.n = llama_kv_cache_cell_max(kv_self); } } @@ -9142,23 +11471,37 @@ static int llama_decode_internal( struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2]; - if (!hparams.causal_attn) { + if (lctx.n_outputs == 0) { + // no output + res = nullptr; + embd = nullptr; + } else if (!hparams.causal_attn) { res = nullptr; // do not extract logits for embedding models such as BERT // token or sequence embeddings embd = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(embd->name, "result_embd") == 0 || strcmp(embd->name, "result_embd_pooled") == 0); - } else { - if (strcmp(res->name, "result_output") == 0) { - // the token embeddings could be the second to last tensor, or the third to last tensor - if (strcmp(embd->name, "result_norm") != 0) { - embd = gf->nodes[gf->n_nodes - 3]; - GGML_ASSERT(strcmp(embd->name, "result_norm") == 0); - } - } else { - GGML_ASSERT(false && "missing result_output tensor"); + } else if (cparams.embeddings) { + // the embeddings could be in the second to last tensor, or any of the previous tensors + int i_embd = gf->n_nodes - 2; + for (int i = 3; strcmp(embd->name, "result_norm") != 0; ++i) { + i_embd = gf->n_nodes - i; + if (i_embd < 0) { break; } + embd = gf->nodes[i_embd]; } + GGML_ASSERT(i_embd >= 0 && "missing result_norm tensor"); + + // TODO: use a per-batch flag to know when to skip logits while keeping embeddings + if (!cparams.causal_attn) { + res = nullptr; // do not extract logits when not needed + // skip computing logits + // TODO: is this safe? + gf->n_nodes = i_embd + 1; + } + } else { + embd = nullptr; // do not extract embeddings when not needed + GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor"); } // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -9201,50 +11544,23 @@ static int llama_decode_internal( //} // extract logits - // TODO: do not compute and extract logits if only embeddings are needed - // update the graphs to skip "result_output" if logits are not needed if (res) { ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(lctx.sched, res); GGML_ASSERT(backend_res != nullptr); - if (u_batch.logits) { - int32_t i_first = -1; - for (uint32_t i = 0; i < n_tokens; i++) { - if (u_batch.logits[i] && i_first == -1) { - i_first = (int32_t) i; - } - if (u_batch.logits[i] == 0 || i == n_tokens - 1) { - if (i_first != -1) { - int i_last = u_batch.logits[i] == 0 ? i : i + 1; - // extract logits for the range [i_first, i_last) - // group the requests to minimize the number of calls to the backend - ggml_backend_tensor_get_async(backend_res, res, - logits_out + n_vocab*(cur_token + i_first), - i_first*n_vocab*sizeof(float), - (i_last - i_first)*n_vocab*sizeof(float)); - i_first = -1; - } - } -#ifndef NDEBUG - logits_valid[cur_token + i] = u_batch.logits[i] != 0;; -#endif - } - } else if (lctx.logits_all) { - ggml_backend_tensor_get_async(backend_res, res, logits_out + n_vocab*cur_token, 0, n_vocab*n_tokens*sizeof(float)); -#ifndef NDEBUG - std::fill(logits_valid.begin() + cur_token, logits_valid.begin() + cur_token + n_tokens, true); -#endif - } else { - if (cur_token + n_tokens >= n_tokens_all) { - ggml_backend_tensor_get_async(backend_res, res, logits_out, n_vocab*(n_tokens - 1)*sizeof(float), n_vocab*sizeof(float)); -#ifndef NDEBUG - logits_valid[0] = true; -#endif - } + GGML_ASSERT(lctx.logits != nullptr); + + float * logits_out = lctx.logits + n_outputs_prev*n_vocab; + const int32_t n_outputs_new = lctx.n_outputs; + + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size); + ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float)); } } // extract embeddings - if (cparams.embeddings && embd) { + if (embd) { ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(lctx.sched, embd); GGML_ASSERT(backend_embd != nullptr); @@ -9252,16 +11568,14 @@ static int llama_decode_internal( case LLAMA_POOLING_TYPE_NONE: { // extract token embeddings - auto & embd_out = lctx.embd; + GGML_ASSERT(lctx.embd != nullptr); + float * embd_out = lctx.embd + n_outputs_prev*n_embd; + const int32_t n_outputs_new = lctx.n_outputs; - if (u_batch.logits) { - //embd_out.resize(n_embd * n_tokens); - for (uint32_t i = 0; i < n_tokens; i++) { - if (u_batch.logits[i] == 0) { - continue; - } - ggml_backend_tensor_get_async(backend_embd, embd, embd_out + n_embd*(i + cur_token), (n_embd*i)*sizeof(float), n_embd*sizeof(float)); - } + if (n_outputs_new) { + GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs); + GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size); + ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float)); } } break; case LLAMA_POOLING_TYPE_CLS: @@ -9288,8 +11602,12 @@ static int llama_decode_internal( } break; } } + n_outputs_prev += lctx.n_outputs; } + // set to total number of outputs in the batch, for use in llama_get_logits_ith + lctx.n_outputs = n_outputs; + // wait for the computation to finish (automatically done when obtaining the model output) //llama_synchronize(&lctx); @@ -9305,6 +11623,10 @@ static int llama_decode_internal( } } + // Reset state for the next token before backend sync, to allow the CPU activities in the reset to + // overlap with device computation. + ggml_backend_sched_reset(lctx.sched); + return 0; } @@ -9330,7 +11652,9 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { // each move requires 6*n_layer tensors (see build_defrag) // - source view, destination view, copy operation // - x2 for keys and values - const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + //const uint32_t max_moves = LLAMA_MAX_NODES/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (LLAMA_MAX_NODES - 2*n_layer)/(6*n_layer); // determine which KV cells to move where // @@ -9646,7 +11970,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE); GGML_ASSERT(llama_is_byte_token(vocab, id)); - const auto& token_data = vocab.id_to_token.at(id); + const auto & token_data = vocab.id_to_token.at(id); switch (llama_vocab_get_type(vocab)) { case LLAMA_VOCAB_TYPE_SPM: { auto buf = token_data.text.substr(3, 2); @@ -9654,7 +11978,7 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) { } case LLAMA_VOCAB_TYPE_BPE: { GGML_ASSERT(false); - return unicode_utf8_to_byte(token_data.text); + return unicode_utf8_to_byte(token_data.text); // TODO: why is this here after GGML_ASSERT? } case LLAMA_VOCAB_TYPE_WPM: { GGML_ASSERT(false); @@ -9876,7 +12200,94 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; - auto word_collection = bpe_gpt2_preprocess(text); + + std::vector word_collection; + switch (vocab.type) { + case LLAMA_VOCAB_TYPE_BPE: + switch (vocab.type_pre) { + case LLAMA_VOCAB_PRE_TYPE_LLAMA3: + case LLAMA_VOCAB_PRE_TYPE_DBRX: + word_collection = unicode_regex_split(text, { + // original regex from tokenizer.json + //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + + // adapted: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2080233989 + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?[A-Za-zµÀ-ÖØ-öø-ƺƼ-ƿDŽ-ʓʕ-ʯͰ-ͳͶͷͻ-ͽͿΆΈ-ΊΌΎ-ΡΣ-ϵϷ-ҁҊ-ԯԱ-ՖႠ-ჅᎠ-Ᏽᏸ-ᏽᲐ-ᲺᲽ-Ჿᴀ-ᴫᵫ-ᵷᵹ-ᶚḀ-ἕἘ-Ἕἠ-ὅὈ-Ὅὐ-ὗὙὛὝὟ-ώᾀ-ᾴᾶ-ᾼιῂ-ῄῆ-ῌῐ-ΐῖ-Ίῠ-Ῥῲ-ῴῶ-ῼℂℇℊ-ℓℕℙ-ℝℤΩℨK-ℭℯ-ℴℹℼ-ℿⅅ-ⅉⅎↃↄⰀ-ⱻⱾ-ⳤⳫ-ⳮⳲⳳꙀ-ꙭꚀ-ꚛꜢ-ꝯꝱ-ꞇꞋ-ꞎꭰ-ꮿff-stﬓ-ﬗA-Za-z𐐀-𐑏𐒰-𐓓𐓘-𐓻𐲀-𐲲𐳀-𐳲𑢠-𑣟𞤀-𞥃]+", + "\\s?[!-/:-~!-/:-~‘-‟ -。]+", + "\\s+$", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}+", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER: + word_collection = unicode_regex_split(text, { + "[\r\n]", + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "[一-龥ࠀ-一가-퟿]+", + "\\p{N}", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_FALCON: + word_collection = unicode_regex_split(text, { + "[\\p{P}\\$\\+<=>\\^~\\|]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "[0-9][0-9][0-9]", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_MPT: + // TODO: MPT pre-tokenization regexes are unknown + // the following are close, but not exact. run the following: + // ./bin/test-tokenizer-0 ../models/ggml-vocab-mpt.gguf + GGML_ASSERT("MPT pre-tokenization regexes are unknown - fixes needed"); + word_collection = unicode_regex_split(text, { + "\\s?\\p{L}+", + "\\s?\\p{P}+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_STARCODER: + case LLAMA_VOCAB_PRE_TYPE_REFACT: + case LLAMA_VOCAB_PRE_TYPE_COMMAND_R: + word_collection = unicode_regex_split(text, { + "\\p{N}", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_GPT2: + case LLAMA_VOCAB_PRE_TYPE_OLMO: + word_collection = unicode_regex_split(text, { + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + }); + break; + case LLAMA_VOCAB_PRE_TYPE_QWEN2: + word_collection = unicode_regex_split(text, { + // original regex from tokenizer.json + // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" + "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+", + }); + break; + default: + // default regex for BPE tokenization pre-processing + word_collection = unicode_regex_split(text, { + "[\\p{P}\\$\\+<=>\\^~\\|]+", + "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)", + "\\p{N}+", + "[0-9][0-9][0-9]", + }); + break; + } + break; + default: + GGML_ASSERT(false); + break; + } symbols_final.clear(); @@ -9933,7 +12344,7 @@ struct llm_tokenizer_bpe { add_new_bigram(bigram.left, left_symbol.next); // right side of current symbol } - // add the fnished tokens to the final list keeping correct order for next and prev + // add the finished tokens to the final list keeping correct order for next and prev for (auto & sym : symbols) { if (sym.n > 0) { sym.prev = final_prev_index; @@ -10003,145 +12414,6 @@ private: work_queue.push(bigram); } - std::vector bpe_gpt2_preprocess(const std::string & text) { - std::vector bpe_words; - std::vector bpe_encoded_words; - - std::string token = ""; - // GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ - bool collecting_numeric = false; - bool collecting_letter = false; - bool collecting_special = false; - bool collecting_whitespace_lookahead = false; - bool collecting = false; - - std::vector text_utf; - text_utf.reserve(text.size()); - bpe_words.reserve(text.size()); - bpe_encoded_words.reserve(text.size()); - - const auto cpts = unicode_cpts_from_utf8(text); - for (size_t i = 0; i < cpts.size(); ++i) - text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i])); - - for (int i = 0; i < (int)text_utf.size(); i++) { - const std::string & utf_char = text_utf[i]; - bool split_condition = false; - int bytes_remain = text_utf.size() - i; - // forward backward lookups - const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : ""; - const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : ""; - - // handling contractions - if (!split_condition && bytes_remain >= 2) { - // 's|'t|'m|'d - if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) { - split_condition = true; - } - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next; - bpe_words.emplace_back(token); - token = ""; - i++; - continue; - } - } - if (!split_condition && bytes_remain >= 3) { - // 're|'ve|'ll - if (utf_char == "\'" && ( - (utf_char_next == "r" && utf_char_next_next == "e") || - (utf_char_next == "v" && utf_char_next_next == "e") || - (utf_char_next == "l" && utf_char_next_next == "l")) - ) { - split_condition = true; - } - if (split_condition) { - // current token + next token can be defined - if (token.size()) { - bpe_words.emplace_back(token); // push previous content as token - } - token = utf_char + utf_char_next + utf_char_next_next; - bpe_words.emplace_back(token); // the contraction - token = ""; - i += 2; - continue; - } - } - - if (!split_condition && !collecting) { - if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) { - collecting_letter = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - collecting_numeric = true; - collecting = true; - } - else if ( - ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) || - (!token.size() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_DIGIT && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE) - ) { - collecting_special = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) { - collecting_whitespace_lookahead = true; - collecting = true; - } - else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) { - split_condition = true; - } - } - else if (!split_condition && collecting) { - if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) { - split_condition = true; - } - else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_DIGIT) { - split_condition = true; - } - else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_DIGIT || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) { - split_condition = true; - } - else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_DIGIT)) { - split_condition = true; - } - } - - if (utf_char_next == "") { - split_condition = true; // final - token += utf_char; - } - - if (split_condition) { - if (token.size()) { - bpe_words.emplace_back(token); - } - token = utf_char; - collecting = false; - collecting_letter = false; - collecting_numeric = false; - collecting_special = false; - collecting_whitespace_lookahead = false; - } - else { - token += utf_char; - } - } - - for (std::string & word : bpe_words) { - std::string encoded_token = ""; - for (char & c : word) { - encoded_token += unicode_byte_to_utf8(c); - } - bpe_encoded_words.emplace_back(encoded_token); - } - - return bpe_encoded_words; - } - const llama_vocab & vocab; std::vector symbols; @@ -10202,9 +12474,6 @@ struct llm_tokenizer_wpm { output.push_back(vocab.special_unk_id); } } - - // append eos token - output.push_back(vocab.special_eos_id); } std::vector preprocess(const std::string & text) { @@ -10218,8 +12487,8 @@ struct llm_tokenizer_wpm { if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) { continue; } - code = to_lower(code); - if (type == CODEPOINT_TYPE_WHITESPACE) { + code = unicode_tolower(code); + if (type == CODEPOINT_TYPE_SEPARATOR) { code = ' '; } std::string s = unicode_cpt_to_utf8(code); @@ -10238,7 +12507,7 @@ struct llm_tokenizer_wpm { std::vector words; while (r < new_str.size()) { // if is whitespace - if (isspace(new_str[r])) { + if (isspace(new_str[r], std::locale::classic())) { if (r > l) words.push_back(new_str.substr(l, (r - l))); l = r + 1; r = l; @@ -10252,18 +12521,12 @@ struct llm_tokenizer_wpm { return words; } - uint32_t to_lower(uint32_t code) { - static const std::locale locale("en_US.UTF-8"); -#if defined(_WIN32) - if (code > 0xFFFF) { - return code; - } -#endif - return std::tolower(wchar_t(code), locale); - } - bool is_ascii_punct(uint32_t code) { - return code < 256 && ispunct(code); + if (code > 0xFF) { + return false; + } + auto c = char(static_cast(code)); + return ispunct(c, std::locale::classic()); } bool is_chinese_char(uint32_t cpt) { @@ -10415,30 +12678,28 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list< } } -static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool bos, bool special) { +static std::vector llama_tokenize_internal(const llama_vocab & vocab, std::string raw_text, bool add_special, bool parse_special) { std::vector output; - - // OG tokenizer behavior: - // - // tokenizer.encode('', add_bos=True) returns [1] - // tokenizer.encode('', add_bos=False) returns [] - - if (bos && vocab.special_bos_id != -1) { - output.push_back(vocab.special_bos_id); - } - - if (raw_text.empty()) { - return output; - } - std::forward_list fragment_buffer; - fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); - if (special) tokenizer_st_partition(vocab, fragment_buffer); + if (!raw_text.empty()) { + fragment_buffer.emplace_front(raw_text, 0, raw_text.length()); + if (parse_special) tokenizer_st_partition(vocab, fragment_buffer); + } switch (vocab.type) { case LLAMA_VOCAB_TYPE_SPM: { + // OG tokenizer behavior: + // + // tokenizer.encode('', add_special_tokens=True) returns [1] + // tokenizer.encode('', add_special_tokens=False) returns [] + + if (add_special && vocab.special_add_bos != 0) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + } + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { // without adding this leading whitespace, we do not get the same results as the original tokenizer @@ -10464,9 +12725,19 @@ static std::vector llama_tokenize_internal(const llama_vocab & output.push_back(fragment.token); } } + + if (add_special && vocab.special_add_eos == 1) { + GGML_ASSERT(vocab.special_eos_id != -1); + output.push_back(vocab.special_eos_id); + } } break; case LLAMA_VOCAB_TYPE_BPE: { + if (add_special && vocab.special_add_bos != 0) { + GGML_ASSERT(vocab.special_bos_id != -1); + output.push_back(vocab.special_bos_id); + } + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -10480,9 +12751,16 @@ static std::vector llama_tokenize_internal(const llama_vocab & output.push_back(fragment.token); } } + + GGML_ASSERT(vocab.special_add_eos != 1); } break; case LLAMA_VOCAB_TYPE_WPM: { + if (add_special) { + GGML_ASSERT(vocab.special_cls_id != -1); + output.push_back(vocab.special_cls_id); + } + for (const auto & fragment : fragment_buffer) { if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) { auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length); @@ -10496,6 +12774,11 @@ static std::vector llama_tokenize_internal(const llama_vocab & output.push_back(fragment.token); } } + + if (add_special) { + GGML_ASSERT(vocab.special_sep_id != -1); + output.push_back(vocab.special_sep_id); + } } break; case LLAMA_VOCAB_TYPE_NONE: GGML_ASSERT(false); @@ -10508,28 +12791,10 @@ static std::vector llama_tokenize_internal(const llama_vocab & // grammar - internal // -struct llama_partial_utf8 { - uint32_t value; // bit value so far (unshifted) - int n_remain; // num bytes remaining; -1 indicates invalid sequence -}; - -struct llama_grammar { - const std::vector> rules; - std::vector> stacks; - - // buffer for partially generated UTF-8 sequence from accepted tokens - llama_partial_utf8 partial_utf8; -}; - -struct llama_grammar_candidate { - size_t index; - const uint32_t * code_points; - llama_partial_utf8 partial_utf8; -}; // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`. -static std::pair, llama_partial_utf8> decode_utf8( +std::pair, llama_partial_utf8> decode_utf8( const std::string & src, llama_partial_utf8 partial_start) { static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 }; @@ -10680,7 +12945,9 @@ static void llama_grammar_advance_stack( std::vector> & new_stacks) { if (stack.empty()) { - new_stacks.emplace_back(stack); + if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + new_stacks.emplace_back(stack); + } return; } @@ -10717,7 +12984,10 @@ static void llama_grammar_advance_stack( } case LLAMA_GRETYPE_CHAR: case LLAMA_GRETYPE_CHAR_NOT: - new_stacks.emplace_back(stack); + if (std::find(new_stacks.begin(), new_stacks.end(), stack) == new_stacks.end()) { + // only add the stack if it's not a duplicate of one we already have + new_stacks.emplace_back(stack); + } break; default: // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range @@ -10731,12 +13001,13 @@ static void llama_grammar_advance_stack( // be positioned at a character range (see `llama_grammar_advance_stack`), and // produces the N possible stacks if the given char is accepted at those // positions -static std::vector> llama_grammar_accept( +void llama_grammar_accept( const std::vector> & rules, const std::vector> & stacks, - const uint32_t chr) { + const uint32_t chr, + std::vector> & new_stacks) { - std::vector> new_stacks; + new_stacks.clear(); for (const auto & stack : stacks) { if (stack.empty()) { @@ -10755,8 +13026,6 @@ static std::vector> llama_grammar_acc llama_grammar_advance_stack(rules, new_stack, new_stacks); } } - - return new_stacks; } static std::vector llama_grammar_reject_candidates( @@ -10770,6 +13039,7 @@ static std::vector llama_grammar_reject_candidates_for_ const std::vector & candidates) { std::vector rejects; + rejects.reserve(candidates.size()); if (stack.empty()) { for (const auto & tok : candidates) { @@ -10783,6 +13053,8 @@ static std::vector llama_grammar_reject_candidates_for_ const llama_grammar_element * stack_pos = stack.back(); std::vector next_candidates; + next_candidates.reserve(candidates.size()); + for (const auto & tok : candidates) { if (*tok.code_points == 0) { // reached end of full codepoints in token, reject iff it ended in a partial sequence @@ -11368,16 +13640,14 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c GGML_ASSERT(ctx); const int64_t t_start_sample_us = ggml_time_us(); - bool allow_eos = false; + bool allow_eog = false; for (const auto & stack : grammar->stacks) { if (stack.empty()) { - allow_eos = true; + allow_eog = true; break; } } - const llama_token eos = llama_token_eos(&ctx->model); - std::vector, llama_partial_utf8>> candidates_decoded; candidates_decoded.reserve(candidates->size); std::vector candidates_grammar; @@ -11385,9 +13655,10 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c for (size_t i = 0; i < candidates->size; ++i) { const llama_token id = candidates->data[i].id; - const std::string piece = llama_token_to_piece(ctx, id); - if (id == eos) { - if (!allow_eos) { + const std::string piece = llama_token_to_piece(ctx, id, false); + + if (llama_token_is_eog(&ctx->model, id)) { + if (!allow_eog) { candidates->data[i].logit = -INFINITY; } } else if (piece.empty() || piece[0] == 0) { @@ -11550,7 +13821,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da return result; } -llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { +llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng) { GGML_ASSERT(ctx); const int64_t t_start_sample_us = ggml_time_us(); @@ -11563,7 +13834,6 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra } std::discrete_distribution<> dist(probs.begin(), probs.end()); - auto & rng = ctx->rng; int idx = dist(rng); llama_token result = candidates->data[idx].id; @@ -11573,10 +13843,14 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra return result; } +llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates) { + return llama_sample_token_with_rng(ctx, candidates, ctx->rng); +} + void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = ggml_time_us(); - if (token == llama_token_eos(&ctx->model)) { + if (llama_token_is_eog(&ctx->model, token)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -11585,13 +13859,15 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar GGML_ASSERT(false); } - const std::string piece = llama_token_to_piece(ctx, token); + const std::string piece = llama_token_to_piece(ctx, token, false); // Note terminating 0 in decoded string const auto decoded = decode_utf8(piece, grammar->partial_utf8); const auto & code_points = decoded.first; + std::vector> tmp_new_stacks; for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { - grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it); + llama_grammar_accept(grammar->rules, grammar->stacks, *it, tmp_new_stacks); + grammar->stacks = tmp_new_stacks; } grammar->partial_utf8 = decoded.second; GGML_ASSERT(!grammar->stacks.empty()); @@ -11725,6 +14001,11 @@ struct llama_beam_search_data { } llama_logit_info logit_info(ctx); std::vector next_tokens = logit_info.top_k(n_beams); + + // Clear the kv slot so that other beams may try different tokens at this position. The llama_decode() + // call in loop() will conclusively fill in the kv slot once the beams converge at this position. + llama_kv_cache_seq_rm(ctx, 0, n_past, -1); + size_t i=0; if (next_beams.size() < n_beams) { for (; next_beams.size() < n_beams ; ++i) { @@ -11894,13 +14175,16 @@ static void llama_tensor_dequantize_internal( if (qtype.to_float == NULL) { throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type))); } - } else if (tensor->type != GGML_TYPE_F16) { + } else if (tensor->type != GGML_TYPE_F16 && + tensor->type != GGML_TYPE_BF16) { throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type))); } if (nthread < 2) { if (tensor->type == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements); + } else if (tensor->type == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements); } else if (ggml_is_quantized(tensor->type)) { qtype.to_float(tensor->data, f32_output, nelements); } else { @@ -11909,7 +14193,14 @@ static void llama_tensor_dequantize_internal( return; } - size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type); + size_t block_size; + if (tensor->type == GGML_TYPE_F16 || + tensor->type == GGML_TYPE_BF16) { + block_size = 1; + } else { + block_size = (size_t)ggml_blck_size(tensor->type); + } + size_t block_size_bytes = ggml_type_size(tensor->type); GGML_ASSERT(nelements % block_size == 0); @@ -11928,6 +14219,8 @@ static void llama_tensor_dequantize_internal( auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) { if (typ == GGML_TYPE_F16) { ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels); + } else if (typ == GGML_TYPE_BF16) { + ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels); } else { qtype.to_float(inbuf, outbuf, nels); } @@ -11957,7 +14250,6 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // sprinkled in the model. Hence, simply dividing i_ffn_down by n_expert does not work // for getting the current layer as I initially thought, and we need to resort to parsing the // tensor name. - n_layer /= n_expert; if (sscanf(name, "blk.%d.", &i_layer) != 1) { throw std::runtime_error(format("Failed to determine layer for tensor %s", name)); } @@ -11971,30 +14263,39 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings // with the quantization of the output tensor if (name == tn(LLM_TENSOR_OUTPUT, "weight") || (!qs.has_output && name == tn(LLM_TENSOR_TOKEN_EMBD, "weight"))) { - int nx = tensor->ne[0]; - if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { - new_type = GGML_TYPE_Q8_0; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = GGML_TYPE_Q5_K; - } - else if (new_type != GGML_TYPE_Q8_0) { - new_type = GGML_TYPE_Q6_K; + if (qs.params->output_tensor_type < GGML_TYPE_COUNT) { + new_type = qs.params->output_tensor_type; + } else { + int nx = tensor->ne[0]; + if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) { + new_type = GGML_TYPE_Q8_0; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + new_type = GGML_TYPE_Q5_K; + } + else if (new_type != GGML_TYPE_Q8_0) { + new_type = GGML_TYPE_Q6_K; + } } } else if (name == "token_embd.weight") { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || - ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) { - new_type = GGML_TYPE_Q2_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { - new_type = GGML_TYPE_IQ3_S; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { - new_type = GGML_TYPE_IQ3_S; + if (qs.params->token_embedding_type < GGML_TYPE_COUNT) { + new_type = qs.params->token_embedding_type; + } else { + if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || + ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { + new_type = GGML_TYPE_Q2_K; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + new_type = GGML_TYPE_IQ3_S; + } + else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { + new_type = GGML_TYPE_IQ3_S; + } } } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || - ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) { + ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) { if (name.find("attn_v.weight") != std::string::npos) { if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K; else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K; @@ -12013,7 +14314,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (qs.model.hparams.n_expert == 8) { new_type = GGML_TYPE_Q5_K; } else { - if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS; + if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) new_type = GGML_TYPE_IQ2_XXS; else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S; } } @@ -12027,13 +14328,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) { new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS; } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { - new_type = GGML_TYPE_Q4_K; - } - else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) { + else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) { new_type = GGML_TYPE_Q4_K; } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) { @@ -12186,7 +14481,8 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS || new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S || - new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) { + new_type == GGML_TYPE_IQ3_XXS || new_type == GGML_TYPE_IQ1_S || new_type == GGML_TYPE_IQ3_S || + new_type == GGML_TYPE_IQ1_M) { int nx = tensor->ne[0]; int ny = tensor->ne[1]; if (nx % QK_K != 0) { @@ -12204,6 +14500,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n case GGML_TYPE_IQ3_XXS: case GGML_TYPE_IQ3_S: case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: case GGML_TYPE_Q2_K: case GGML_TYPE_Q3_K: case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break; @@ -12219,21 +14516,27 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n return new_type; } -static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, const float * imatrix, std::vector & workers, const int nthread) { - std::mutex mutex; - int counter = 0; - size_t new_size = 0; +static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int64_t chunk_size, int64_t nrows, int64_t n_per_row, const float * imatrix, std::vector & workers, const int nthread) { if (nthread < 2) { // single-thread - return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); + size_t new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, imatrix); + if (!ggml_validate_row_data(new_type, new_data, new_size)) { + throw std::runtime_error("quantized data validation failed"); + } + return new_size; } - auto compute = [&mutex, &counter, &new_size, new_type, f32_data, new_data, chunk_size, + + std::mutex mutex; + int64_t counter = 0; + size_t new_size = 0; + bool valid = true; + auto compute = [&mutex, &counter, &new_size, &valid, new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix]() { - const int nrows_per_chunk = chunk_size / n_per_row; + const int64_t nrows_per_chunk = chunk_size / n_per_row; size_t local_size = 0; while (true) { std::unique_lock lock(mutex); - int first_row = counter; counter += nrows_per_chunk; + int64_t first_row = counter; counter += nrows_per_chunk; if (first_row >= nrows) { if (local_size > 0) { new_size += local_size; @@ -12241,8 +14544,18 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa break; } lock.unlock(); - const int this_nrow = std::min(nrows - first_row, nrows_per_chunk); - local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); + const int64_t this_nrow = std::min(nrows - first_row, nrows_per_chunk); + size_t this_size = ggml_quantize_chunk(new_type, f32_data, new_data, first_row * n_per_row, this_nrow, n_per_row, imatrix); + local_size += this_size; + + // validate the quantized data + const size_t row_size = ggml_row_size(new_type, n_per_row); + void * this_data = (char *) new_data + first_row * row_size; + if (!ggml_validate_row_data(new_type, this_data, this_size)) { + std::unique_lock lock(mutex); + valid = false; + break; + } } }; for (int it = 0; it < nthread - 1; ++it) { @@ -12251,6 +14564,9 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa compute(); for (auto & w : workers) { w.join(); } workers.clear(); + if (!valid) { + throw std::runtime_error("quantized data validation failed"); + } return new_size; } @@ -12265,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break; case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break; + case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break; case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break; // K-quants @@ -12285,6 +14602,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_IQ2_M: default_type = GGML_TYPE_IQ2_S; break; case LLAMA_FTYPE_MOSTLY_IQ3_XXS: default_type = GGML_TYPE_IQ3_XXS; break; case LLAMA_FTYPE_MOSTLY_IQ1_S: default_type = GGML_TYPE_IQ1_S; break; + case LLAMA_FTYPE_MOSTLY_IQ1_M: default_type = GGML_TYPE_IQ1_M; break; case LLAMA_FTYPE_MOSTLY_IQ4_NL: default_type = GGML_TYPE_IQ4_NL; break; case LLAMA_FTYPE_MOSTLY_IQ4_XS: default_type = GGML_TYPE_IQ4_XS; break; case LLAMA_FTYPE_MOSTLY_IQ3_S: default_type = GGML_TYPE_IQ3_S; break; @@ -12307,8 +14625,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s constexpr bool use_mmap = false; #endif - llama_model_loader ml(fname_inp, use_mmap, NULL); - ml.init_mapping(false); // no prefetching? + llama_model_kv_override * kv_overrides = nullptr; + if (params->kv_overrides) { + auto v = (std::vector*)params->kv_overrides; + kv_overrides = v->data(); + } + llama_model_loader ml(fname_inp, use_mmap, /*check_tensors*/ true, kv_overrides); + ml.init_mappings(false); // no prefetching llama_model model; llm_load_arch(ml, model); @@ -12332,36 +14655,54 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s struct gguf_context * ctx_out = gguf_init_empty(); // copy the KV pairs from the input file - gguf_set_kv (ctx_out, ml.ctx_gguf); + gguf_set_kv (ctx_out, ml.meta); gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION); gguf_set_val_u32(ctx_out, "general.file_type", ftype); + // Remove split metadata + gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_NO).c_str()); + gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str()); + gguf_remove_key(ctx_out, ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str()); + + if (params->kv_overrides) { + const std::vector & overrides = *(const std::vector *)params->kv_overrides; + for (auto & o : overrides) { + if (o.key[0] == 0) break; + if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) { + gguf_set_val_f32(ctx_out, o.key, o.val_f64); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) { + gguf_set_val_i32(ctx_out, o.key, o.val_i64); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) { + gguf_set_val_bool(ctx_out, o.key, o.val_bool); + } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) { + gguf_set_val_str(ctx_out, o.key, o.val_str); + } else { + LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key); + } + } + } for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); + const struct ggml_tensor * meta = ml.get_tensor_meta(i); const std::string name = ggml_get_name(meta); // TODO: avoid hardcoded tensor names - use the TN_* constants - if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { + if (name.find("attn_v.weight") != std::string::npos || + name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; - } - else if (name.find("ffn_down") != std::string::npos) { - ++qs.n_ffn_down; - } - else if (name.find("ffn_gate") != std::string::npos) { - ++qs.n_ffn_gate; - } - else if (name.find("ffn_up") != std::string::npos) { - ++qs.n_ffn_up; - } - else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { + } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) { qs.has_output = true; } } - if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) { - LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n", - __func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer); - } + + qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer; + + // sanity checks + // + // - qs.n_attention_wv == 0 for Mamba models + // - qs.n_attention_wv == model.hparams.n_layer for Transformer models + // + GGML_ASSERT((qs.n_attention_wv == 0 || qs.n_attention_wv == (int)model.hparams.n_layer) && "n_attention_wv is unexpected"); size_t total_size_org = 0; size_t total_size_new = 0; @@ -12375,24 +14716,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector> work; std::vector> f32_conv_buf; + uint16_t n_split = 1; + // Assume split index is continuous + if (params->keep_split) { + for (int i = 0; i < ml.n_tensors; ++i) { + n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split); + } + } + std::vector ctx_outs(n_split, NULL); + ctx_outs[0] = ctx_out; + // populate the original tensors so we get an initial meta data for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * meta = ml.get_tensor_meta(i); - gguf_add_tensor(ctx_out, meta); + auto weight = ml.get_weight(i); + uint16_t i_split = params->keep_split ? weight->idx : 0; + struct ggml_tensor * tensor = weight->tensor; + if (ctx_outs[i_split] == NULL) { + ctx_outs[i_split] = gguf_init_empty(); + } + gguf_add_tensor(ctx_outs[i_split], tensor); } - std::ofstream fout(fname_out, std::ios::binary); - fout.exceptions(std::ofstream::failbit); // fail fast on write errors + // Set split info if needed + if (n_split > 1) { + for (size_t i = 0; i < ctx_outs.size(); ++i) { + gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i); + gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split); + gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors); + } + } - const size_t meta_size = gguf_get_meta_size(ctx_out); + int cur_split = -1; + std::ofstream fout; + auto close_ofstream = [&]() { + // Write metadata and close file handler + if (fout.is_open()) { + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_outs[cur_split])); + gguf_get_meta_data(ctx_outs[cur_split], data.data()); + fout.write((const char *) data.data(), data.size()); + fout.close(); + } + }; + auto new_ofstream = [&](int index) { + cur_split = index; + GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context"); + std::string fname = fname_out; + if (params->keep_split) { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split); + fname = std::string(split_path); + } - LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size); - - // placeholder for the meta data - ::zeros(fout, meta_size); + fout = std::ofstream(fname, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]); + // placeholder for the meta data + ::zeros(fout, meta_size); + }; + const auto tn = LLM_TN(model.arch); + new_ofstream(0); for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * tensor = ml.get_tensor_meta(i); + auto weight = ml.get_weight(i); + struct ggml_tensor * tensor = weight->tensor; + if (weight->idx != cur_split && params->keep_split) { + close_ofstream(); + new_ofstream(weight->idx); + } const std::string name = ggml_get_name(tensor); @@ -12413,8 +14804,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // This used to be a regex, but has an extreme cost to compile times. bool quantize = name.rfind("weight") == name.size() - 6; // ends with 'weight'? - // quantize only 2D tensors - quantize &= (ggml_n_dims(tensor) == 2); + // quantize only 2D and 3D tensors (experts) + quantize &= (ggml_n_dims(tensor) >= 2); + + // do not quantize norm tensors + quantize &= name.find("_norm.weight") == std::string::npos; + quantize &= params->quantize_output_tensor || name != "output.weight"; quantize &= !params->only_copy; @@ -12443,6 +14838,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (!params->pure && ggml_is_quantized(default_type)) { new_type = llama_tensor_get_type(qs, new_type, tensor, ftype); } + if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) { + new_type = params->token_embedding_type; + } + if (params->output_tensor_type < GGML_TYPE_COUNT && strcmp(tensor->name, "output.weight") == 0) { + new_type = params->output_tensor_type; + } // If we've decided to quantize to the same type the tensor is already // in then there's nothing to do. @@ -12455,7 +14856,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_size = ggml_nbytes(tensor); LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0); } else { - const size_t nelements = ggml_nelements(tensor); + const int64_t nelements = ggml_nelements(tensor); const float * imatrix = nullptr; if (imatrix_data) { @@ -12463,11 +14864,20 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (it == imatrix_data->end()) { LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name); } else { - if (it->second.size() == (size_t)tensor->ne[0]) { + if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) { imatrix = it->second.data(); } else { LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__, - int(it->second.size()), int(tensor->ne[0]), tensor->name); + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name); + + // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix + // this is a significant error and it may be good idea to abort the process if this happens, + // since many people will miss the error and not realize that most of the model is being quantized without an imatrix + // tok_embd should be ignored in this case, since it always causes this warning + if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) { + throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s", + int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name)); + } } } } @@ -12475,6 +14885,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_S || new_type == GGML_TYPE_IQ1_S || + (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) || (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) { LLAMA_LOG_ERROR("\n\n============================================================\n"); LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name); @@ -12497,47 +14908,48 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type)); fflush(stdout); - if (work.size() < nelements * 4) { + if (work.size() < (size_t)nelements * 4) { work.resize(nelements * 4); // upper bound on size } new_data = work.data(); - const int n_per_row = tensor->ne[0]; - const int nrows = nelements / n_per_row; + const int64_t n_per_row = tensor->ne[0]; + const int64_t nrows = tensor->ne[1]; - static const int min_chunk_size = 32 * 512; - const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); + static const int64_t min_chunk_size = 32 * 512; + const int64_t chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row); - const int nchunk = (nelements + chunk_size - 1)/chunk_size; - const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1; - new_size = llama_tensor_quantize_internal(new_type, f32_data, new_data, chunk_size, nrows, n_per_row, imatrix, workers, nthread_use); + const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1]; + const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size; + const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1; + // quantize each expert separately since they have different importance matrices + new_size = 0; + for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) { + const float * f32_data_03 = f32_data + i03 * nelements_matrix; + void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows; + const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr; + + new_size += llama_tensor_quantize_internal(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use); + } LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); } total_size_org += ggml_nbytes(tensor); total_size_new += new_size; // update the gguf meta data as we go - gguf_set_tensor_type(ctx_out, name.c_str(), new_type); - gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size); + gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type); + gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size); // write tensor data + padding fout.write((const char *) new_data, new_size); zeros(fout, GGML_PAD(new_size, align) - new_size); } - - // go back to beginning of file and write the updated meta data - { - fout.seekp(0); - std::vector data(gguf_get_meta_size(ctx_out)); - gguf_get_meta_data(ctx_out, data.data()); - fout.write((const char *) data.data(), data.size()); + close_ofstream(); + for (auto & c:ctx_outs) { + gguf_free(c); } - fout.close(); - - gguf_free(ctx_out); - LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0); LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0); @@ -12581,8 +14993,8 @@ static int llama_apply_lora_from_file_internal( std::unique_ptr ml; if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); - ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(/*prefetch*/ false); // no prefetching + ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*check_tensors*/ false, /*kv_overrides*/ nullptr)); + ml->init_mappings(/*prefetch*/ false); // no prefetching } struct tensor_meta { @@ -12703,7 +15115,7 @@ static int llama_apply_lora_from_file_internal( ggml_tensor * base_t; if (ml) { - if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + if (!ml->get_tensor_meta(base_name.c_str())) { LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } @@ -12840,6 +15252,7 @@ struct llama_model_params llama_model_default_params() { /*.vocab_only =*/ false, /*.use_mmap =*/ true, /*.use_mlock =*/ false, + /*.check_tensors =*/ false, }; #ifdef GGML_USE_METAL @@ -12876,6 +15289,7 @@ struct llama_context_params llama_context_default_params() { /*.logits_all =*/ false, /*.embeddings =*/ false, /*.offload_kqv =*/ true, + /*.flash_attn =*/ false, /*.abort_callback =*/ nullptr, /*.abort_callback_data =*/ nullptr, }; @@ -12887,11 +15301,15 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { struct llama_model_quantize_params result = { /*.nthread =*/ 0, /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1, + /*.output_tensor_type =*/ GGML_TYPE_COUNT, + /*.token_embedding_type =*/ GGML_TYPE_COUNT, /*.allow_requantize =*/ false, /*.quantize_output_tensor =*/ true, /*.only_copy =*/ false, /*.pure =*/ false, + /*.keep_split =*/ false, /*.imatrix =*/ nullptr, + /*.kv_overrides =*/ nullptr, }; return result; @@ -12900,7 +15318,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() { size_t llama_max_devices(void) { #if defined(GGML_USE_METAL) return 1; -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) return GGML_CUDA_MAX_DEVICES; #elif defined(GGML_USE_SYCL) return GGML_SYCL_MAX_DEVICES; @@ -12920,8 +15338,8 @@ bool llama_supports_mlock(void) { } bool llama_supports_gpu_offload(void) { -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ - defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) +#if defined(GGML_USE_CUDA) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \ + defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) // Defined when llama.cpp is compiled with support for offloading model layers to GPU. return true; #else @@ -13028,7 +15446,7 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = model->hparams; auto & cparams = ctx->cparams; - // TODO: maybe add n_seq_max here too + cparams.n_seq_max = std::max(1u, params.n_seq_max); cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -13038,16 +15456,28 @@ struct llama_context * llama_new_context_with_model( cparams.defrag_thold = params.defrag_thold; cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; + cparams.flash_attn = params.flash_attn; cparams.pooling_type = params.pooling_type; cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx; cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base; cparams.rope_freq_scale = params.rope_freq_scale == 0.0f ? hparams.rope_freq_scale_train : params.rope_freq_scale; + // this is necessary due to kv_self.n being padded later during inference + cparams.n_ctx = GGML_PAD(cparams.n_ctx, 256); + // with causal attention, the batch size is limited by the context size cparams.n_batch = hparams.causal_attn ? std::min(cparams.n_ctx, params.n_batch) : params.n_batch; - cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); + // the batch has to be at least GGML_KQ_MASK_PAD because we will be padding the KQ_mask + // this is required by GPU kernels in order to avoid out-of-bounds accesses (e.g. ggml_flash_attn_ext) + // ref: https://github.com/ggerganov/llama.cpp/pull/5021 + if (cparams.n_batch < GGML_KQ_MASK_PAD) { + LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD); + cparams.n_batch = GGML_KQ_MASK_PAD; + } + + cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch); cparams.n_yarn_orig_ctx = params.yarn_orig_ctx != 0 ? params.yarn_orig_ctx : hparams.n_yarn_orig_ctx != 0 ? hparams.n_yarn_orig_ctx : @@ -13079,6 +15509,16 @@ struct llama_context * llama_new_context_with_model( } } + if (cparams.flash_attn && hparams.use_alibi) { + LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__); + cparams.flash_attn = false; + } + + if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) { + LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__); + cparams.flash_attn = false; + } + if (params.seed == LLAMA_DEFAULT_SEED) { params.seed = time(NULL); } @@ -13086,6 +15526,7 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: n_ctx = %u\n", __func__, cparams.n_ctx); LLAMA_LOG_INFO("%s: n_batch = %u\n", __func__, cparams.n_batch); LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch); + LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn); LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); @@ -13123,7 +15564,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) +#elif defined(GGML_USE_CUDA) if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); @@ -13146,7 +15587,20 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_VULKAN) - if (model->n_gpu_layers > 0) { + if (model->split_mode == LLAMA_SPLIT_MODE_ROW) { + LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__); + llama_free(ctx); + return nullptr; + } + if (model->split_mode == LLAMA_SPLIT_MODE_NONE) { + ggml_backend_t backend = ggml_backend_vk_init(0); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) { ggml_backend_t backend = ggml_backend_vk_init(device); if (backend == nullptr) { @@ -13158,30 +15612,28 @@ struct llama_context * llama_new_context_with_model( } } #elif defined(GGML_USE_SYCL) - if (model->n_gpu_layers > 0) { - // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); + if (backend == nullptr) { + int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { + ggml_backend_t backend = ggml_backend_sycl_init(i); if (backend == nullptr) { - int main_gpu_id = ggml_backend_sycl_get_device_id(model->main_gpu); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, main_gpu_id, model->main_gpu); + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_LAYER requires a backend for each GPU - for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { - ggml_backend_t backend = ggml_backend_sycl_init(i); - if (backend == nullptr) { - int id_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d) backend\n", __func__, id_list[i], i); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_KOMPUTE) @@ -13203,7 +15655,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -13229,25 +15681,12 @@ struct llama_context * llama_new_context_with_model( // graph outputs buffer { - // resized during inference, reserve maximum - ctx->logits_size = hparams.n_vocab*cparams.n_batch; - ctx->embd_size = params.embeddings ? hparams.n_embd*cparams.n_batch : 0; - - const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float); - - ctx->buf_output = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_output_size); - if (ctx->buf_output == nullptr) { - LLAMA_LOG_ERROR("%s: failed to allocate logits buffer\n", __func__); + // resized during inference when a batch uses more outputs + if (llama_output_reserve(*ctx, params.n_seq_max) < params.n_seq_max) { + LLAMA_LOG_ERROR("%s: failed to reserve initial output buffer\n", __func__); llama_free(ctx); return nullptr; } - ggml_backend_buffer_clear(ctx->buf_output, 0); - - - ctx->logits = (float *) ggml_backend_buffer_get_base(ctx->buf_output); - if (params.embeddings) { - ctx->embd = ctx->logits + ctx->logits_size; - } LLAMA_LOG_INFO("%s: %10s output buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(ctx->buf_output), @@ -13272,7 +15711,7 @@ struct llama_context * llama_new_context_with_model( // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary bool pipeline_parallel = llama_get_device_count() > 1 && model->n_gpu_layers > (int)model->hparams.n_layer && model->split_mode == LLAMA_SPLIT_MODE_LAYER; -#ifndef GGML_USE_CUBLAS +#ifndef GGML_USE_CUDA // pipeline parallelism requires support for async compute and events // currently this is only implemented in the CUDA backend pipeline_parallel = false; @@ -13380,18 +15819,24 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_ORION: case LLM_ARCH_INTERNLM2: case LLM_ARCH_MINICPM: + case LLM_ARCH_XVERSE: case LLM_ARCH_COMMAND_R: + case LLM_ARCH_OLMO: return LLAMA_ROPE_TYPE_NORM; // the pairs of head values are offset by n_rot/2 case LLM_ARCH_FALCON: + case LLM_ARCH_GROK: + case LLM_ARCH_DBRX: case LLM_ARCH_PERSIMMON: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: case LLM_ARCH_STABLELM: case LLM_ARCH_QWEN: case LLM_ARCH_QWEN2: + case LLM_ARCH_QWEN2MOE: case LLM_ARCH_PHI2: + case LLM_ARCH_PHI3: case LLM_ARCH_GEMMA: case LLM_ARCH_STARCODER2: return LLAMA_ROPE_TYPE_NEOX; @@ -13405,6 +15850,10 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { return LLAMA_ROPE_TYPE_NONE; } +enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) { + return ctx->cparams.pooling_type; +} + int32_t llama_n_vocab(const struct llama_model * model) { return model->hparams.n_vocab; } @@ -13760,30 +16209,61 @@ void llama_kv_cache_update(struct llama_context * ctx) { llama_kv_cache_update_internal(*ctx); } +// deprecated +size_t llama_get_state_size(const struct llama_context * ctx) { + return llama_state_get_size(ctx); +} + +// deprecated +size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { + return llama_state_get_data(ctx, dst); +} + +// deprecated +size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { + return llama_state_set_data(ctx, src); +} + +// deprecated +bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + return llama_state_load_file(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); +} + +// deprecated +bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + return llama_state_save_file(ctx, path_session, tokens, n_token_count); +} // Returns the *maximum* size of the state -size_t llama_get_state_size(const struct llama_context * ctx) { +size_t llama_state_get_size(const struct llama_context * ctx) { + const auto & cparams = ctx->cparams; + const auto & hparams = ctx->model.hparams; + // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state. // for reference, std::mt19937(1337) serializes to 6701 bytes. const size_t s_rng_size = sizeof(size_t); const size_t s_rng = LLAMA_MAX_RNG_STATE; + const size_t s_n_outputs = sizeof(size_t); + // assume worst case for outputs although only currently set ones are serialized + const size_t s_output_pos = ctx->cparams.n_batch * sizeof(int32_t); const size_t s_logits_size = sizeof(size_t); - // assume worst case for logits although only currently set ones are serialized - const size_t s_logits = ctx->logits_size * sizeof(float); + const size_t s_logits = ctx->logits_size ? cparams.n_batch * hparams.n_vocab * sizeof(float) : 0; const size_t s_embedding_size = sizeof(size_t); - const size_t s_embedding = ctx->embd_size * sizeof(float); + const size_t s_embedding = ctx->embd_size ? cparams.n_batch * hparams.n_embd * sizeof(float) : 0; const size_t s_kv_buf_size = sizeof(size_t); const size_t s_kv_head = sizeof(uint32_t); const size_t s_kv_size = sizeof(uint32_t); const size_t s_kv_used = sizeof(uint32_t); + const size_t s_v_trans = sizeof(uint32_t); const size_t s_kv = ctx->kv_self.total_size(); - // TODO: assume the max is more than 1 seq_id per KV cell - const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + sizeof(llama_seq_id); + const size_t s_kv_cell = sizeof(llama_pos) + sizeof(size_t) + cparams.n_seq_max*sizeof(llama_seq_id); const size_t s_kv_cells = ctx->kv_self.size * s_kv_cell; const size_t s_total = ( + s_rng_size + s_rng + + s_n_outputs + + s_output_pos + s_logits_size + s_logits + s_embedding_size @@ -13792,10 +16272,14 @@ size_t llama_get_state_size(const struct llama_context * ctx) { + s_kv_head + s_kv_size + s_kv_used + + s_v_trans + s_kv + s_kv_cells ); + // on session change it is very likely that the state size has changed - so we need to update this function + static_assert(LLAMA_SESSION_VERSION == 6, "So you just bumped the session version - good. But did you remember to update llama_state_get_size?"); + return s_total; } @@ -13844,21 +16328,23 @@ struct llama_data_file_context : llama_data_context { * file context: * llama_file file("/path", "wb"); * llama_data_file_context data_ctx(&file); - * llama_copy_state_data(ctx, &data_ctx); + * llama_state_get_data(ctx, &data_ctx); * * buffer context: * std::vector buf(max_size, 0); * llama_data_buffer_context data_ctx(&buf.data()); - * llama_copy_state_data(ctx, &data_ctx); + * llama_state_get_data(ctx, &data_ctx); * */ -static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { +static void llama_state_get_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) { + llama_synchronize(ctx); + // copy rng { std::ostringstream rng_ss; rng_ss << ctx->rng; - const std::string & rng_str = rng_ss.str(); + const std::string & rng_str = rng_ss.str(); const size_t rng_size = rng_str.size(); GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); @@ -13867,25 +16353,61 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(rng_str.data(), rng_size); } - // copy logits + // copy outputs { - const size_t logits_size = ctx->logits_size; + // Can't use ctx->n_outputs because it's not for the + // entire last batch when n_ubatch is smaller than n_batch + size_t n_outputs = 0; - data_ctx->write(&logits_size, sizeof(logits_size)); + // copy output ids + { + std::vector output_pos; - if (logits_size) { - data_ctx->write(ctx->logits, logits_size * sizeof(float)); + const size_t n_batch = ctx->cparams.n_batch; + const auto & output_ids = ctx->output_ids; + + output_pos.resize(ctx->output_size); + + // build a more compact representation of the output ids + for (size_t i = 0; i < n_batch; ++i) { + // map an output id to a position in the batch + int32_t pos = output_ids[i]; + if (pos >= 0) { + if ((size_t) pos >= n_outputs) { + n_outputs = pos + 1; + } + GGML_ASSERT((size_t) pos < ctx->output_size); + output_pos[pos] = i; + } + } + + data_ctx->write(&n_outputs, sizeof(n_outputs)); + + if (n_outputs) { + data_ctx->write(output_pos.data(), n_outputs * sizeof(int32_t)); + } } - } - // copy embeddings - { - const size_t embeddings_size = ctx->embd_size; + // copy logits + { + const size_t logits_size = std::min(ctx->logits_size, n_outputs * ctx->model.hparams.n_vocab); - data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + data_ctx->write(&logits_size, sizeof(logits_size)); - if (embeddings_size) { - data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + if (logits_size) { + data_ctx->write(ctx->logits, logits_size * sizeof(float)); + } + } + + // copy embeddings + { + const size_t embeddings_size = std::min(ctx->embd_size, n_outputs * ctx->model.hparams.n_embd); + + data_ctx->write(&embeddings_size, sizeof(embeddings_size)); + + if (embeddings_size) { + data_ctx->write(ctx->embd, embeddings_size * sizeof(float)); + } } } @@ -13898,17 +16420,22 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); - const size_t kv_buf_size = kv_self.total_size(); + // NOTE: kv_size and kv_buf_size are mostly used for sanity checks const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); const uint32_t kv_size = kv_self.size; + const size_t kv_buf_size = kv_self.total_size() / (kv_size ? kv_size : 1) * kv_head; const uint32_t kv_used = kv_self.used; + const uint32_t v_trans = kv_self.v_trans ? 1 : 0; data_ctx->write(&kv_buf_size, sizeof(kv_buf_size)); data_ctx->write(&kv_head, sizeof(kv_head)); data_ctx->write(&kv_size, sizeof(kv_size)); data_ctx->write(&kv_used, sizeof(kv_used)); + data_ctx->write(&v_trans, sizeof(v_trans)); if (kv_buf_size) { + const size_t pre_kv_buf_size = data_ctx->get_size_written(); + std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); @@ -13917,7 +16444,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); - if (kv_self.recurrent) { + if (kv_self.recurrent || !kv_self.v_trans) { // v is contiguous for recurrent models // TODO: use other tensors for state models than k and v const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); @@ -13938,6 +16465,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat data_ctx->write(tmp_buf.data(), tmp_buf.size()); } } + GGML_ASSERT(kv_buf_size == data_ctx->get_size_written() - pre_kv_buf_size); } for (uint32_t i = 0; i < kv_head; ++i) { @@ -13956,15 +16484,17 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat } } -size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { +size_t llama_state_get_data(struct llama_context * ctx, uint8_t * dst) { llama_data_buffer_context data_ctx(dst); - llama_copy_state_data_internal(ctx, &data_ctx); + llama_state_get_data_internal(ctx, &data_ctx); return data_ctx.get_size_written(); } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { +size_t llama_state_set_data(struct llama_context * ctx, const uint8_t * src) { + llama_synchronize(ctx); + const uint8_t * inp = src; // set rng @@ -13982,6 +16512,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { GGML_ASSERT(!rng_ss.fail()); } + // set output ids + { + size_t n_outputs; + std::vector output_pos; + + memcpy(&n_outputs, inp, sizeof(n_outputs)); inp += sizeof(n_outputs); + + GGML_ASSERT(n_outputs <= llama_output_reserve(*ctx, n_outputs)); + + if (n_outputs) { + output_pos.resize(n_outputs); + memcpy(output_pos.data(), inp, n_outputs * sizeof(int32_t)); + inp += n_outputs * sizeof(int32_t); + + for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { + int32_t id = output_pos[i]; + GGML_ASSERT((uint32_t) id < ctx->cparams.n_batch); + ctx->output_ids[id] = i; + } + + ctx->n_outputs = n_outputs; + } + } + // set logits { size_t logits_size; @@ -14002,7 +16556,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { memcpy(&embeddings_size, inp, sizeof(embeddings_size)); inp += sizeof(embeddings_size); - GGML_ASSERT(ctx->embd_size == embeddings_size); + GGML_ASSERT(ctx->embd_size >= embeddings_size); if (embeddings_size) { memcpy(ctx->embd, inp, embeddings_size * sizeof(float)); @@ -14023,14 +16577,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { uint32_t kv_head; uint32_t kv_size; uint32_t kv_used; + uint32_t v_trans; memcpy(&kv_buf_size, inp, sizeof(kv_buf_size)); inp += sizeof(kv_buf_size); memcpy(&kv_head, inp, sizeof(kv_head)); inp += sizeof(kv_head); memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size); memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); + memcpy(&v_trans, inp, sizeof(v_trans)); inp += sizeof(v_trans); + + GGML_ASSERT(kv_self.v_trans == (bool) v_trans); // incompatible V transposition + + if (kv_self.size != kv_size) { + // the KV cache needs to be big enough to load all the KV cells from the saved state + GGML_ASSERT(kv_self.size >= kv_head); + + LLAMA_LOG_INFO("%s: state contains %d KV cells, was saved with kv_size=%d, but is loaded with kv_size=%d (fine, but different)\n", + __func__, kv_head, kv_size, kv_self.size); + } + + llama_kv_cache_clear(ctx); if (kv_buf_size) { - GGML_ASSERT(kv_self.total_size() == kv_buf_size); + const size_t pre_kv_buf_size = inp - src; + + GGML_ASSERT(kv_self.total_size() >= kv_buf_size); for (int il = 0; il < (int) n_layer; ++il) { const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head); @@ -14038,7 +16608,7 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; - if (kv_self.recurrent) { + if (kv_self.recurrent || !kv_self.v_trans) { // v is contiguous for recurrent models // TODO: use other tensors for state models than k and v const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); @@ -14050,23 +16620,19 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); - const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); + const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_self.size); for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size); inp += v_row_size; } } + GGML_ASSERT(kv_buf_size == inp - src - pre_kv_buf_size); } - GGML_ASSERT(kv_self.size == kv_size); - ctx->kv_self.head = kv_head; - ctx->kv_self.size = kv_size; ctx->kv_self.used = kv_used; - ctx->kv_self.cells.resize(kv_size); - for (uint32_t i = 0; i < kv_head; ++i) { llama_pos pos; size_t seq_id_size; @@ -14083,22 +16649,17 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ctx->kv_self.cells[i].seq_id.insert(seq_id); } } - - for (uint32_t i = kv_head; i < kv_size; ++i) { - ctx->kv_self.cells[i].pos = -1; - ctx->kv_self.cells[i].seq_id.clear(); - } } const size_t nread = inp - src; - const size_t max_size = llama_get_state_size(ctx); + const size_t max_size = llama_state_get_size(ctx); GGML_ASSERT(nread <= max_size); return nread; } -static bool llama_load_session_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +static bool llama_state_load_file_internal(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { llama_file file(path_session, "rb"); // sanity checks @@ -14136,7 +16697,7 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c // restore the context state { const size_t n_state_size_cur = file.size - file.tell(); - const size_t n_state_size_max = llama_get_state_size(ctx); + const size_t n_state_size_max = llama_state_get_size(ctx); if (n_state_size_cur > n_state_size_max) { LLAMA_LOG_ERROR("%s : the state size in session file is too big! max %zu, got %zu\n", __func__, n_state_size_max, n_state_size_cur); @@ -14146,22 +16707,22 @@ static bool llama_load_session_file_internal(struct llama_context * ctx, const c std::vector state_data(n_state_size_max); file.read_raw(state_data.data(), n_state_size_cur); - llama_set_state_data(ctx, state_data.data()); + llama_state_set_data(ctx, state_data.data()); } return true; } -bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { +bool llama_state_load_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { try { - return llama_load_session_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); + return llama_state_load_file_internal(ctx, path_session, tokens_out, n_token_capacity, n_token_count_out); } catch (const std::exception & err) { LLAMA_LOG_ERROR("error loading session file: %s\n", err.what()); return false; } } -bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { +static bool llama_state_save_file_internal(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { llama_file file(path_session, "wb"); file.write_u32(LLAMA_SESSION_MAGIC); @@ -14175,11 +16736,479 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi // save the context state using stream saving llama_data_file_context data_ctx(&file); - llama_copy_state_data_internal(ctx, &data_ctx); + llama_state_get_data_internal(ctx, &data_ctx); return true; } +bool llama_state_save_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count) { + try { + return llama_state_save_file_internal(ctx, path_session, tokens, n_token_count); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("error saving session file: %s\n", err.what()); + return false; + } +} + +size_t llama_state_seq_get_size(struct llama_context* ctx, llama_seq_id seq_id) { + // save the size of size_t as a uint32_t for safety check + const size_t size_t_size_size = sizeof(uint32_t); + + // other values + const size_t s_cell_count_size = sizeof(uint32_t); + const size_t s_layer_count_size = sizeof(uint32_t); + const size_t n_embd_v_gqa_size = sizeof(uint32_t); + + size_t s_cell_count = 0; + size_t s_cell_data_size = 0; + const auto & kv_self = ctx->kv_self; + const auto & hparams = ctx->model.hparams; + + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); + + for (uint32_t i = 0; i < kv_self.size; ++i) { + const auto & cell = kv_self.cells[i]; + if (cell.seq_id.count(seq_id) > 0) { + ++s_cell_count; + s_cell_data_size += sizeof(llama_pos); + } + } + + for (int il = 0; il < (int)n_layer; ++il) { + // types of keys and values + s_cell_data_size += sizeof(int32_t) * 2; + // k_size_row and v_size_el values of layer + s_cell_data_size += sizeof(size_t) * 2; + + // keys + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + s_cell_data_size += k_size_row * s_cell_count; + + // values (transposed) + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + s_cell_data_size += v_size_el * s_cell_count * n_embd_v_gqa; + } + + const size_t s_total = ( + size_t_size_size + + s_cell_count_size + + s_layer_count_size + + n_embd_v_gqa_size + + s_cell_data_size + ); + + return s_total; +} + +static size_t llama_state_seq_get_data_internal(struct llama_context * ctx, llama_data_context & data_ctx, llama_seq_id seq_id) { + llama_synchronize(ctx); + + const auto & kv_self = ctx->kv_self; + GGML_ASSERT(!kv_self.recurrent); // not implemented + + // Save the size of size_t as a uint32_t for safety check + const uint32_t size_t_size = sizeof(size_t); + data_ctx.write(&size_t_size, sizeof(size_t_size)); + + std::vector> cell_ranges; // ranges, from inclusive, to exclusive + uint32_t cell_count = 0; + + // Count the number of cells with the specified seq_id + // Find all the ranges of cells with this seq id + { + uint32_t cell_range_begin = kv_self.size; + for (uint32_t i = 0; i < kv_self.size; ++i) { + const auto & cell = kv_self.cells[i]; + if (cell.has_seq_id(seq_id)) { + ++cell_count; + if (cell_range_begin == kv_self.size) { + cell_range_begin = i; + } + } + else { + if (cell_range_begin != kv_self.size) { + cell_ranges.push_back({ cell_range_begin, i }); + cell_range_begin = kv_self.size; + } + } + } + if (cell_range_begin != kv_self.size) { + cell_ranges.push_back({ cell_range_begin, kv_self.size }); + } + + // DEBUG CHECK: Sum of cell counts in ranges should equal the total cell count + uint32_t cell_count_check = 0; + for (const auto & range : cell_ranges) { + cell_count_check += range.second - range.first; + } + GGML_ASSERT(cell_count == cell_count_check); + } + + // Write the cell count + data_ctx.write(&cell_count, sizeof(cell_count)); + + const auto & hparams = ctx->model.hparams; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); + + // Write the layer count + data_ctx.write(&n_layer, sizeof(n_layer)); + + // Write n_embd_v_gqa + data_ctx.write(&n_embd_v_gqa, sizeof(n_embd_v_gqa)); + + // Iterate the ranges and write all the pos (this is the token position in the prompt) + for (const auto & range : cell_ranges) { + for (uint32_t i = range.first; i < range.second; ++i) { + const auto & cell = kv_self.cells[i]; + data_ctx.write(&cell.pos, sizeof(cell.pos)); + } + } + + // Iterate and write all the keys first, each row is a cell + // Get whole range at a time + std::vector tmp_buf; + for (int il = 0; il < (int)n_layer; ++il) { + // Write key type + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + data_ctx.write(&k_type_i, sizeof(k_type_i)); + + // Write row size of key + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + data_ctx.write(&k_size_row, sizeof(k_size_row)); + + // Read each range of cells of k_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * k_size_row); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), range.first * k_size_row, range_size * k_size_row); + data_ctx.write(tmp_buf.data(), tmp_buf.size()); + } + } + + // TODO: simplify, reduce copy-paste + if (!kv_self.v_trans) { + for (int il = 0; il < (int)n_layer; ++il) { + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + data_ctx.write(&v_type_i, sizeof(v_type_i)); + + // Write row size of value + const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + data_ctx.write(&v_size_row, sizeof(v_size_row)); + + // Read each range of cells of v_size length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + tmp_buf.resize(range_size * v_size_row); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), range.first * v_size_row, range_size * v_size_row); + data_ctx.write(tmp_buf.data(), tmp_buf.size()); + } + } + } else { + // For the values, they are transposed, so we also need the element size and get the element ranges from each row + const uint32_t kv_size = kv_self.size; + for (int il = 0; il < (int)n_layer; ++il) { + // Write value type + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + data_ctx.write(&v_type_i, sizeof(v_type_i)); + + // Write element size + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + data_ctx.write(&v_size_el, sizeof(v_size_el)); + + // For each row, we get the element values of each cell + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + // Read each range of cells of v_size_el length each into tmp_buf and write out + for (const auto & range : cell_ranges) { + const size_t range_size = range.second - range.first; + const size_t src_offset = (range.first + j * kv_size) * v_size_el; + tmp_buf.resize(range_size * v_size_el); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), src_offset, tmp_buf.size()); + data_ctx.write(tmp_buf.data(), tmp_buf.size()); + } + } + } + } + + return data_ctx.get_size_written(); +} + +size_t llama_state_seq_get_data(struct llama_context* ctx, uint8_t* dst, llama_seq_id seq_id) { + llama_data_buffer_context data_ctx(dst); + return llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); +} + +size_t llama_state_seq_set_data(struct llama_context * ctx, const uint8_t * src, llama_seq_id dest_seq_id) { + llama_synchronize(ctx); + + auto & kv_self = ctx->kv_self; + GGML_ASSERT(!kv_self.recurrent); // not implemented + + // Wipe the slot + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + + const uint8_t * inp = src; + + // Read size of size_t + uint32_t size_t_size; + memcpy(&size_t_size, inp, sizeof(size_t_size)); + inp += sizeof(size_t_size); + if (size_t_size != sizeof(size_t)) { + LLAMA_LOG_ERROR("%s: size_t size mismatch\n", __func__); + return 0; + } + + // Read the cell count + uint32_t cell_count; + memcpy(&cell_count, inp, sizeof(cell_count)); + inp += sizeof(cell_count); + + // Read the layer count + uint32_t n_layer_ref; + memcpy(&n_layer_ref, inp, sizeof(n_layer_ref)); + inp += sizeof(n_layer_ref); + + // Read n_embd_v_gqa + uint32_t n_embd_v_gqa_ref; + memcpy(&n_embd_v_gqa_ref, inp, sizeof(n_embd_v_gqa_ref)); + inp += sizeof(n_embd_v_gqa_ref); + + // Sanity check model compatibility + const auto & hparams = ctx->model.hparams; + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); + if (n_layer != n_layer_ref) { + LLAMA_LOG_ERROR("%s: mismatched n_layer (%d != %d)\n", __func__, n_layer, n_layer_ref); + return 0; + } + if (n_embd_v_gqa != n_embd_v_gqa_ref) { + LLAMA_LOG_ERROR("%s: mismatched n_embd_v_gqa (%d != %d)\n", __func__, n_embd_v_gqa, n_embd_v_gqa_ref); + return 0; + } + + // Allocate the new cells for the slot + if (cell_count) { + llama_batch batch = llama_batch_init(cell_count, 0, 1); + batch.n_tokens = cell_count; + for (uint32_t i = 0; i < cell_count; ++i) { + llama_pos pos; + memcpy(&pos, inp, sizeof(pos)); + inp += sizeof(pos); + + batch.pos[i] = pos; + batch.n_seq_id[i] = 1; + batch.seq_id[i][0] = dest_seq_id; + } + if (!llama_kv_cache_find_slot(kv_self, batch)) { + llama_batch_free(batch); + LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); + return 0; + } + + // DEBUG CHECK: kv_self.head should be our first cell, kv_self.head + cell_count - 1 should be our last cell (verify seq_id and pos values) + // Assume that this is one contiguous block of cells + GGML_ASSERT(kv_self.head + cell_count <= kv_self.size); + GGML_ASSERT(kv_self.cells[kv_self.head].pos == batch.pos[0]); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].pos == batch.pos[cell_count - 1]); + GGML_ASSERT(kv_self.cells[kv_self.head].has_seq_id(dest_seq_id)); + GGML_ASSERT(kv_self.cells[kv_self.head + cell_count - 1].has_seq_id(dest_seq_id)); + + // Cleanup + llama_batch_free(batch); + } + + const uint32_t kv_size = kv_self.size; + const uint32_t kv_head = kv_self.head; + + // For each layer, read the keys for each cell, one row is one cell, read as one contiguous blo + for (int il = 0; il < (int)n_layer; ++il) { + // Read type of key + int32_t k_type_i_ref; + memcpy(&k_type_i_ref, inp, sizeof(k_type_i_ref)); + inp += sizeof(k_type_i_ref); + const int32_t k_type_i = (int32_t)kv_self.k_l[il]->type; + if (k_type_i != k_type_i_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched key type (%d != %d, layer %d)\n", __func__, k_type_i, k_type_i_ref, il); + return 0; + } + + // Read row size of key + size_t k_size_row_ref; + memcpy(&k_size_row_ref, inp, sizeof(k_size_row_ref)); + inp += sizeof(k_size_row_ref); + const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); + if (k_size_row != k_size_row_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched key row size (%zu != %zu, layer %d)\n", __func__, k_size_row, k_size_row_ref, il); + return 0; + } + + if (cell_count) { + // Read and set the keys for the whole cell range + ggml_backend_tensor_set(kv_self.k_l[il], inp, kv_head * k_size_row, cell_count * k_size_row); + inp += cell_count * k_size_row; + } + } + + // TODO: simplify, reduce copy-paste + if (!kv_self.v_trans) { + for (int il = 0; il < (int)n_layer; ++il) { + // Read type of value + int32_t v_type_i_ref; + memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); + inp += sizeof(v_type_i_ref); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return 0; + } + + // Read row size of value + size_t v_size_row_ref; + memcpy(&v_size_row_ref, inp, sizeof(v_size_row_ref)); + inp += sizeof(v_size_row_ref); + const size_t v_size_row = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa); + if (v_size_row != v_size_row_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched value row size (%zu != %zu, layer %d)\n", __func__, v_size_row, v_size_row_ref, il); + return 0; + } + + if (cell_count) { + // Read and set the values for the whole cell range + ggml_backend_tensor_set(kv_self.v_l[il], inp, kv_head * v_size_row, cell_count * v_size_row); + inp += cell_count * v_size_row; + } + } + } else { + // For each layer, read the values for each cell (transposed) + for (int il = 0; il < (int)n_layer; ++il) { + // Read type of value + int32_t v_type_i_ref; + memcpy(&v_type_i_ref, inp, sizeof(v_type_i_ref)); + inp += sizeof(v_type_i_ref); + const int32_t v_type_i = (int32_t)kv_self.v_l[il]->type; + if (v_type_i != v_type_i_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched value type (%d != %d, layer %d)\n", __func__, v_type_i, v_type_i_ref, il); + return 0; + } + + // Read element size of value + size_t v_size_el_ref; + memcpy(&v_size_el_ref, inp, sizeof(v_size_el_ref)); + inp += sizeof(v_size_el_ref); + const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); + if (v_size_el != v_size_el_ref) { + llama_kv_cache_seq_rm(kv_self, dest_seq_id, -1, -1); + LLAMA_LOG_ERROR("%s: mismatched value element size (%zu != %zu, layer %d)\n", __func__, v_size_el, v_size_el_ref, il); + return 0; + } + + if (cell_count) { + // For each row in the transposed matrix, read the values for the whole cell range + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + const size_t dst_offset = (kv_head + j * kv_size) * v_size_el; + ggml_backend_tensor_set(kv_self.v_l[il], inp, dst_offset, cell_count * v_size_el); + inp += cell_count * v_size_el; + } + } + } + } + + const size_t nread = inp - src; + + return nread; +} + +static size_t llama_state_seq_save_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { + llama_file file(filepath, "wb"); + + file.write_u32(LLAMA_STATE_SEQ_MAGIC); + file.write_u32(LLAMA_STATE_SEQ_VERSION); + + // save the prompt + file.write_u32((uint32_t)n_token_count); + file.write_raw(tokens, sizeof(llama_token) * n_token_count); + + // save the context state using stream saving + llama_data_file_context data_ctx(&file); + llama_state_seq_get_data_internal(ctx, data_ctx, seq_id); + + const size_t res = file.tell(); + GGML_ASSERT(res == sizeof(uint32_t) * 3 + sizeof(llama_token) * n_token_count + data_ctx.get_size_written()); + return res; +} + +static size_t llama_state_seq_load_file_internal(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + llama_file file(filepath, "rb"); + + // version checks + { + const uint32_t magic = file.read_u32(); + const uint32_t version = file.read_u32(); + + if (magic != LLAMA_STATE_SEQ_MAGIC || version != LLAMA_STATE_SEQ_VERSION) { + LLAMA_LOG_ERROR("%s: unknown (magic, version) for sequence state file: %08x, %08x\n", __func__, magic, version); + return 0; + } + } + + // load the prompt + { + const uint32_t n_token_count = file.read_u32(); + + if (n_token_count > n_token_capacity) { + LLAMA_LOG_ERROR("%s: token count in sequence state file exceeded capacity! %u > %zu\n", __func__, n_token_count, n_token_capacity); + return 0; + } + + file.read_raw(tokens_out, sizeof(llama_token) * n_token_count); + *n_token_count_out = n_token_count; + } + + // restore the context state + { + const size_t state_size = file.size - file.tell(); + std::vector state_data(state_size); + file.read_raw(state_data.data(), state_size); + const size_t nread = llama_state_seq_set_data(ctx, state_data.data(), dest_seq_id); + if (!nread) { + LLAMA_LOG_ERROR("%s: failed to restore sequence state\n", __func__); + return 0; + } + GGML_ASSERT(nread <= state_size); + GGML_ASSERT(nread + sizeof(uint32_t) * 3 + sizeof(llama_token) * *n_token_count_out == file.tell()); + } + + return file.tell(); +} + +size_t llama_state_seq_save_file(struct llama_context * ctx, const char * filepath, llama_seq_id seq_id, const llama_token * tokens, size_t n_token_count) { + try { + return llama_state_seq_save_file_internal(ctx, filepath, seq_id, tokens, n_token_count); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("error saving sequence state file: %s\n", err.what()); + return 0; + } +} + +size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepath, llama_seq_id dest_seq_id, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out) { + try { + return llama_state_seq_load_file_internal(ctx, filepath, dest_seq_id, tokens_out, n_token_capacity, n_token_count_out); + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("error loading sequence state file: %s\n", err.what()); + return 0; + } +} + void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) { ctx->cparams.n_threads = n_threads; ctx->cparams.n_threads_batch = n_threads_batch; @@ -14293,11 +17322,41 @@ float * llama_get_logits(struct llama_context * ctx) { } float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) { - assert(ctx->logits_valid.at(i)); - + int32_t j = -1; llama_synchronize(ctx); - return ctx->logits + i*ctx->model.hparams.n_vocab; + try { + if (ctx->logits == nullptr) { + throw std::runtime_error("no logits"); + } + + if (i < 0) { + j = ctx->n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + } + } else if ((size_t) i >= ctx->output_ids.size()) { + throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } else { + j = ctx->output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= ctx->n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + } + + return ctx->logits + j*ctx->model.hparams.n_vocab; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid logits id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ASSERT(false); +#endif + return nullptr; + } } float * llama_get_embeddings(struct llama_context * ctx) { @@ -14307,9 +17366,42 @@ float * llama_get_embeddings(struct llama_context * ctx) { } float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i) { + int32_t j = -1; + llama_synchronize(ctx); - return ctx->embd + i*ctx->model.hparams.n_embd; + try { + if (ctx->embd == nullptr) { + throw std::runtime_error("no embeddings"); + } + + if (i < 0) { + j = ctx->n_outputs + i; + if (j < 0) { + throw std::runtime_error(format("negative index out of range [0, %d)", ctx->n_outputs)); + } + } else if ((size_t) i >= ctx->output_ids.size()) { + throw std::runtime_error(format("out of range [0, %lu)", ctx->output_ids.size())); + } else { + j = ctx->output_ids[i]; + } + + if (j < 0) { + throw std::runtime_error(format("batch.logits[%d] != true", i)); + } + if (j >= ctx->n_outputs) { + // This should not happen + throw std::runtime_error(format("corrupt output buffer (j=%d, n_outputs=%d)", j, ctx->n_outputs)); + } + + return ctx->embd + j*ctx->model.hparams.n_embd; + } catch (const std::exception & err) { + LLAMA_LOG_ERROR("%s: invalid embeddings id %d, reason: %s\n", __func__, i, err.what()); +#ifndef NDEBUG + GGML_ASSERT(false); +#endif + return nullptr; + } } float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id) { @@ -14338,6 +17430,13 @@ llama_token_type llama_token_get_type(const struct llama_model * model, llama_to return model->vocab.id_to_token[token].type; } +bool llama_token_is_eog(const struct llama_model * model, llama_token token) { + return token != -1 && ( + token == llama_token_eos(model) || + token == llama_token_eot(model) + ); +} + llama_token llama_token_bos(const struct llama_model * model) { return model->vocab.special_bos_id; } @@ -14346,6 +17445,14 @@ llama_token llama_token_eos(const struct llama_model * model) { return model->vocab.special_eos_id; } +llama_token llama_token_cls(const struct llama_model * model) { + return model->vocab.special_cls_id; +} + +llama_token llama_token_sep(const struct llama_model * model) { + return model->vocab.special_sep_id; +} + llama_token llama_token_nl(const struct llama_model * model) { return model->vocab.linefeed_id; } @@ -14380,9 +17487,9 @@ int32_t llama_tokenize( int32_t text_len, llama_token * tokens, int32_t n_tokens_max, - bool add_bos, - bool special) { - auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_bos, special); + bool add_special, + bool parse_special) { + auto res = llama_tokenize_internal(model->vocab, std::string(text, text_len), add_special, parse_special); if (n_tokens_max < (int) res.size()) { // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__); @@ -14398,16 +17505,17 @@ int32_t llama_tokenize( static std::string llama_decode_text(const std::string & text) { std::string decoded_text; - auto unicode_sequences = unicode_cpts_from_utf8(text); - for (auto & unicode_sequence : unicode_sequences) { - decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence)); + + const auto cpts = unicode_cpts_from_utf8(text); + for (const auto cpt : cpts) { + decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt)); } return decoded_text; } // does not write null-terminator to buf -int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length) { +int32_t llama_token_to_piece(const struct llama_model * model, llama_token token, char * buf, int32_t length, bool special) { if (0 <= token && token < llama_n_vocab(model)) { switch (llama_vocab_get_type(model->vocab)) { case LLAMA_VOCAB_TYPE_WPM: @@ -14422,7 +17530,9 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } memcpy(buf, result.c_str(), result.length()); return result.length(); - } else if (llama_is_user_defined_token(model->vocab, token)) { + } else if ( + (llama_is_user_defined_token(model->vocab, token)) || + (llama_is_control_token (model->vocab, token) && special)) { std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { return -(int) result.length(); @@ -14435,8 +17545,6 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } memcpy(buf, "\xe2\x96\x85", 3); return 3; - } else if (llama_is_control_token(model->vocab, token)) { - ; } else if (llama_is_byte_token(model->vocab, token)) { if (length < 1) { return -1; @@ -14457,15 +17565,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token } memcpy(buf, result.c_str(), result.length()); return result.length(); - } else if (llama_is_user_defined_token(model->vocab, token)) { + } else if ( + (llama_is_user_defined_token(model->vocab, token)) || + (llama_is_control_token (model->vocab, token) && special)) { std::string result = model->vocab.id_to_token[token].text; if (length < (int) result.length()) { return -(int) result.length(); } memcpy(buf, result.c_str(), result.length()); return result.length(); - } else if (llama_is_control_token(model->vocab, token)) { - ; } break; } @@ -14618,6 +17726,88 @@ static int32_t llama_chat_apply_template_internal( ss << trim(message->content); } } + } else if (tmpl == "openchat" || tmpl.find("GPT4 Correct ") != std::string::npos) { + // openchat/openchat-3.5-0106, + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "<|end_of_turn|>"; + } else { + role[0] = toupper(role[0]); + ss << "GPT4 Correct " << role << ": " << message->content << "<|end_of_turn|>"; + } + } + if (add_ass) { + ss << "GPT4 Correct Assistant:"; + } + } else if (tmpl == "vicuna" || tmpl == "vicuna-orca" || (tmpl.find("USER: ") != std::string::npos && tmpl.find("ASSISTANT: ") != std::string::npos)) { + // eachadea/vicuna-13b-1.1 (and Orca variant) + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + // Orca-Vicuna variant uses a system prefix + if (tmpl == "vicuna-orca" || tmpl.find("SYSTEM: ") != std::string::npos) { + ss << "SYSTEM: " << message->content << "\n"; + } else { + ss << message->content << "\n\n"; + } + } else if (role == "user") { + ss << "USER: " << message->content << "\n"; + } else if (role == "assistant") { + ss << "ASSISTANT: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "ASSISTANT:"; + } + } else if (tmpl == "deepseek" || (tmpl.find("### Instruction:") != std::string::npos && tmpl.find("<|EOT|>") != std::string::npos)) { + // deepseek-ai/deepseek-coder-33b-instruct + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content; + } else if (role == "user") { + ss << "### Instruction:\n" << message->content << "\n"; + } else if (role == "assistant") { + ss << "### Response:\n" << message->content << "\n<|EOT|>\n"; + } + } + if (add_ass) { + ss << "### Response:\n"; + } + } else if (tmpl == "command-r" || (tmpl.find("<|START_OF_TURN_TOKEN|>") != std::string::npos && tmpl.find("<|USER_TOKEN|>") != std::string::npos)) { + // CohereForAI/c4ai-command-r-plus + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "user") { + ss << "<|START_OF_TURN_TOKEN|><|USER_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } else if (role == "assistant") { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" << trim(message->content) << "<|END_OF_TURN_TOKEN|>"; + } + } + if (add_ass) { + ss << "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"; + } + } else if (tmpl == "llama3" || (tmpl.find("<|start_header_id|>") != std::string::npos && tmpl.find("<|end_header_id|>") != std::string::npos)) { + // Llama 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|start_header_id|>" << role << "<|end_header_id|>\n\n" << trim(message->content) << "<|eot_id|>"; + } + if (add_ass) { + ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; + } + } else if (tmpl == "phi3" || (tmpl.find("<|assistant|>") != std::string::npos && tmpl.find("<|end|>") != std::string::npos )) { + // Phi 3 + for (auto message : chat) { + std::string role(message->role); + ss << "<|" << role << "|>\n" << trim(message->content) << "<|end|>\n"; + } + if (add_ass) { + ss << "<|assistant|>\n"; + } } else { // template not supported return -1; @@ -14667,6 +17857,30 @@ LLAMA_API int32_t llama_chat_apply_template( return res; } +LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count) { + static const char * const SPLIT_PATH_FORMAT = "%s-%05d-of-%05d.gguf"; + if (snprintf(split_path, maxlen, SPLIT_PATH_FORMAT, path_prefix, split_no + 1, split_count)) { + return strlen(split_path); + } + return 0; +} + +int llama_split_prefix(char * dest, size_t maxlen, const char * split_path, int split_no, int split_count) { + std::string str_split_path(split_path); + char postfix[32]; + snprintf(postfix, 32, "-%05d-of-%05d.gguf", split_no + 1, split_count); + std::string str_postfix(postfix); + + // check if dest ends with postfix + int size_prefix = str_split_path.size() - str_postfix.size(); + if (size_prefix > 0 && str_split_path.find(str_postfix, size_prefix) != std::string::npos) { + snprintf(dest, std::min((size_t) size_prefix + 1, maxlen), "%s", split_path); + return size_prefix; + } + + return 0; +} + struct llama_timings llama_get_timings(struct llama_context * ctx) { struct llama_timings result = { /*.t_start_ms =*/ 1e-3 * ctx->t_start_us, @@ -14677,7 +17891,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) { /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, /*.n_sample =*/ std::max(1, ctx->n_sample), - /*.n_p_eval =*/ std::max(1, ctx->n_p_eval), + /*.n_p_eval =*/ std::max(0, ctx->n_p_eval), /*.n_eval =*/ std::max(1, ctx->n_eval), }; @@ -14726,6 +17940,11 @@ const char * llama_print_system_info(void) { s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; s += "MATMUL_INT8 = " + std::to_string(ggml_cpu_has_matmul_int8()) + " | "; +#ifdef GGML_USE_LLAMAFILE + s += "LLAMAFILE = 1 | "; +#else + s += "LLAMAFILE = 0 | "; +#endif return s.c_str(); } diff --git a/llama.h b/llama.h index 40dcf54e3..0b2e708d0 100644 --- a/llama.h +++ b/llama.h @@ -37,9 +37,13 @@ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' +#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN -#define LLAMA_SESSION_VERSION 4 +#define LLAMA_SESSION_VERSION 6 + +#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ +#define LLAMA_STATE_SEQ_VERSION 1 #ifdef __cplusplus extern "C" { @@ -60,9 +64,26 @@ extern "C" { enum llama_vocab_type { LLAMA_VOCAB_TYPE_NONE = 0, // For models without vocab - LLAMA_VOCAB_TYPE_SPM = 1, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 2, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_WPM = 3, // WordPiece + LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback + LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE + LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece + }; + + // pre-tokenization types + enum llama_vocab_pre_type { + LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0, + LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2, + LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3, + LLAMA_VOCAB_PRE_TYPE_FALCON = 4, + LLAMA_VOCAB_PRE_TYPE_MPT = 5, + LLAMA_VOCAB_PRE_TYPE_STARCODER = 6, + LLAMA_VOCAB_PRE_TYPE_GPT2 = 7, + LLAMA_VOCAB_PRE_TYPE_REFACT = 8, + LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9, + LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10, + LLAMA_VOCAB_PRE_TYPE_OLMO = 11, + LLAMA_VOCAB_PRE_TYPE_DBRX = 12, }; // note: these values should be synchronized with ggml_rope @@ -117,6 +138,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors + LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -154,7 +177,7 @@ extern "C" { bool sorted; } llama_token_data_array; - typedef bool (*llama_progress_callback)(float progress, void *ctx); + typedef bool (*llama_progress_callback)(float progress, void * user_data); // Input data for llama_decode // A llama_batch object can contain input about one or many sequences @@ -190,15 +213,19 @@ extern "C" { LLAMA_KV_OVERRIDE_TYPE_INT, LLAMA_KV_OVERRIDE_TYPE_FLOAT, LLAMA_KV_OVERRIDE_TYPE_BOOL, + LLAMA_KV_OVERRIDE_TYPE_STR, }; struct llama_model_kv_override { - char key[128]; enum llama_model_kv_override_type tag; + + char key[128]; + union { - int64_t int_value; - double float_value; - bool bool_value; + int64_t val_i64; + double val_f64; + bool val_bool; + char val_str[128]; }; }; @@ -227,9 +254,10 @@ extern "C" { const struct llama_model_kv_override * kv_overrides; // Keep the booleans together to avoid misalignment during copy-by-value. - bool vocab_only; // only load the vocabulary, no weights - bool use_mmap; // use mmap if possible - bool use_mlock; // force system to keep model in RAM + bool vocab_only; // only load the vocabulary, no weights + bool use_mmap; // use mmap if possible + bool use_mlock; // force system to keep model in RAM + bool check_tensors; // validate model tensor data }; struct llama_context_params { @@ -265,6 +293,7 @@ extern "C" { bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embeddings; // if true, extract embeddings (together with logits) bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU + bool flash_attn; // whether to use flash attention // Abort callback // if it returns true, execution of llama_decode() will be aborted @@ -275,13 +304,17 @@ extern "C" { // model quantization parameters typedef struct llama_model_quantize_params { - int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() - enum llama_ftype ftype; // quantize to this llama_ftype - bool allow_requantize; // allow quantizing non-f32/f16 tensors - bool quantize_output_tensor; // quantize output.weight - bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored - bool pure; // quantize all tensors to the default type - void * imatrix; // pointer to importance matrix data + int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() + enum llama_ftype ftype; // quantize to this llama_ftype + enum ggml_type output_tensor_type; // output tensor type + enum ggml_type token_embedding_type; // itoken embeddings tensor type + bool allow_requantize; // allow quantizing non-f32/f16 tensors + bool quantize_output_tensor; // quantize output.weight + bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored + bool pure; // quantize all tensors to the default type + bool keep_split; // quantize to the same number of shards + void * imatrix; // pointer to importance matrix data + void * kv_overrides; // pointer to vector containing overrides } llama_model_quantize_params; // grammar types @@ -382,8 +415,10 @@ extern "C" { LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); - LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); - LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); + LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); + + LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); + LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); @@ -514,11 +549,12 @@ extern "C" { // Returns the number of used KV cells (i.e. have at least one sequence assigned to them) LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx); - // Clear the KV cache + // Clear the KV cache - both cell info is erased and KV data is zeroed LLAMA_API void llama_kv_cache_clear( struct llama_context * ctx); // Removes all tokens that belong to the specified sequence and have positions in [p0, p1) + // Returns false if a partial sequence cannot be removed. Removing a whole sequence never fails // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) @@ -590,34 +626,92 @@ extern "C" { // Returns the maximum size in bytes of the state (rng, logits, embedding // and kv_cache) - will often be smaller after compacting tokens - LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx); + LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx); + LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx), + "use llama_state_get_size instead"); // Copies the state to the specified destination address. // Destination needs to have allocated enough memory. // Returns the number of bytes copied - LLAMA_API size_t llama_copy_state_data( + LLAMA_API size_t llama_state_get_data( struct llama_context * ctx, uint8_t * dst); + LLAMA_API DEPRECATED(size_t llama_copy_state_data( + struct llama_context * ctx, + uint8_t * dst), + "use llama_state_get_data instead"); // Set the state reading from the specified address // Returns the number of bytes read - LLAMA_API size_t llama_set_state_data( + LLAMA_API size_t llama_state_set_data( struct llama_context * ctx, const uint8_t * src); + LLAMA_API DEPRECATED(size_t llama_set_state_data( + struct llama_context * ctx, + const uint8_t * src), + "use llama_state_set_data instead"); // Save/load session file - LLAMA_API bool llama_load_session_file( + LLAMA_API bool llama_state_load_file( struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out); + LLAMA_API DEPRECATED(bool llama_load_session_file( + struct llama_context * ctx, + const char * path_session, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out), + "use llama_state_load_file instead"); - LLAMA_API bool llama_save_session_file( + LLAMA_API bool llama_state_save_file( struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count); + LLAMA_API DEPRECATED(bool llama_save_session_file( + struct llama_context * ctx, + const char * path_session, + const llama_token * tokens, + size_t n_token_count), + "use llama_state_save_file instead"); + + // Get the exact size needed to copy the KV cache of a single sequence + LLAMA_API size_t llama_state_seq_get_size( + struct llama_context * ctx, + llama_seq_id seq_id); + + // Copy the KV cache of a single sequence into the specified buffer + LLAMA_API size_t llama_state_seq_get_data( + struct llama_context * ctx, + uint8_t * dst, + llama_seq_id seq_id); + + // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence + // Returns: + // - Positive: Ok + // - Zero: Failed to load + LLAMA_API size_t llama_state_seq_set_data( + struct llama_context * ctx, + const uint8_t * src, + llama_seq_id dest_seq_id); + + LLAMA_API size_t llama_state_seq_save_file( + struct llama_context * ctx, + const char * filepath, + llama_seq_id seq_id, + const llama_token * tokens, + size_t n_token_count); + + LLAMA_API size_t llama_state_seq_load_file( + struct llama_context * ctx, + const char * filepath, + llama_seq_id dest_seq_id, + llama_token * tokens_out, + size_t n_token_capacity, + size_t * n_token_count_out); // // Decoding @@ -674,23 +768,31 @@ extern "C" { LLAMA_API void llama_synchronize(struct llama_context * ctx); // Token logits obtained from the last call to llama_decode() - // The logits for the last token are stored in the last row - // Logits for which llama_batch.logits[i] == 0 are undefined - // Rows: n_tokens provided with llama_batch + // The logits for which llama_batch.logits[i] != 0 are stored contiguously + // in the order they have appeared in the batch. + // Rows: number of tokens for which llama_batch.logits[i] != 0 // Cols: n_vocab LLAMA_API float * llama_get_logits(struct llama_context * ctx); - // Logits for the ith token. Equivalent to: - // llama_get_logits(ctx) + i*n_vocab + // Logits for the ith token. For positive indices, Equivalent to: + // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab + // Negative indicies can be used to access logits in reverse order, -1 is the last logit. + // returns NULL for invalid ids. LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i); - // Get all output token embeddings - // shape: [n_tokens*n_embd] (1-dimensional) + // Get all output token embeddings. + // when pooling_type == LLAMA_POOLING_TYPE_NONE or when using a generative model, + // the embeddings for which llama_batch.logits[i] != 0 are stored contiguously + // in the order they have appeared in the batch. + // shape: [n_outputs*n_embd] + // Otherwise, returns NULL. LLAMA_API float * llama_get_embeddings(struct llama_context * ctx); - // Get the embeddings for the ith token - // llama_get_embeddings(ctx) + i*n_embd + // Get the embeddings for the ith token. For positive indices, Equivalent to: + // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd + // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding. // shape: [n_embd] (1-dimensional) + // returns NULL for invalid ids. LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i); // Get the embeddings for a sequence id @@ -708,9 +810,14 @@ extern "C" { LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); + // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.) + LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token); + // Special tokens LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification + LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line // Returns -1 if unknown, 1 for true or 0 for false. @@ -719,7 +826,7 @@ extern "C" { // Returns -1 if unknown, 1 for true or 0 for false. LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model); - // codellama infill tokens + // Codellama infill tokens LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix @@ -733,26 +840,28 @@ extern "C" { /// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns a negative number on failure - the number of tokens that would have been returned - /// @param special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated as plaintext. - /// Does not insert a leading space. + /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated + /// as plaintext. Does not insert a leading space. LLAMA_API int32_t llama_tokenize( const struct llama_model * model, const char * text, int32_t text_len, llama_token * tokens, int32_t n_tokens_max, - bool add_bos, - bool special); + bool add_special, + bool parse_special); // Token Id -> Piece. // Uses the vocabulary in the provided context. // Does not write null terminator to the buffer. // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens. + // @param special If true, special tokens are rendered in the output. LLAMA_API int32_t llama_token_to_piece( const struct llama_model * model, llama_token token, char * buf, - int32_t length); + int32_t length, + bool special); /// Apply chat template. Inspired by hf apply_chat_template() on python. /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" @@ -905,7 +1014,7 @@ extern "C" { struct llama_context * ctx, llama_token_data_array * candidates); - /// @details Randomly selects a token from the candidates based on their probabilities. + /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. LLAMA_API llama_token llama_sample_token( struct llama_context * ctx, llama_token_data_array * candidates); @@ -960,6 +1069,16 @@ extern "C" { int32_t n_past, int32_t n_predict); + /// @details Build a split GGUF final path for this chunk. + /// llama_split_path(split_path, sizeof(split_path), "/models/ggml-model-q4_0", 2, 4) => split_path = "/models/ggml-model-q4_0-00002-of-00004.gguf" + // Returns the split_path length. + LLAMA_API int llama_split_path(char * split_path, size_t maxlen, const char * path_prefix, int split_no, int split_count); + + /// @details Extract the path prefix from the split_path if and only if the split_no and split_count match. + /// llama_split_prefix(split_prefix, 64, "/models/ggml-model-q4_0-00002-of-00004.gguf", 2, 4) => split_prefix = "/models/ggml-model-q4_0" + // Returns the split_prefix length. + LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); + // Performance information LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); @@ -982,15 +1101,49 @@ extern "C" { // Internal API to be implemented by llama.cpp and used by tests/benchmarks only #ifdef LLAMA_API_INTERNAL -#include +#include #include +#include struct ggml_tensor; +struct llama_partial_utf8 { + uint32_t value; // bit value so far (unshifted) + int n_remain; // num bytes remaining; -1 indicates invalid sequence +}; + +struct llama_grammar { + const std::vector> rules; + std::vector> stacks; + + // buffer for partially generated UTF-8 sequence from accepted tokens + llama_partial_utf8 partial_utf8; +}; + +struct llama_grammar_candidate { + size_t index; + const uint32_t * code_points; + llama_partial_utf8 partial_utf8; +}; + const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ); +void llama_grammar_accept( + const std::vector> & rules, + const std::vector> & stacks, + const uint32_t chr, + std::vector> & new_stacks); + +std::pair, llama_partial_utf8> decode_utf8( + const std::string & src, + llama_partial_utf8 partial_start); + +// Randomly selects a token from the candidates based on their probabilities using given std::mt19937. +// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences. +llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng); + #endif // LLAMA_API_INTERNAL #endif // LLAMA_H diff --git a/media/matmul.png b/media/matmul.png new file mode 100644 index 000000000..786a20492 Binary files /dev/null and b/media/matmul.png differ diff --git a/media/matmul.svg b/media/matmul.svg new file mode 100644 index 000000000..1d6cb4bb7 --- /dev/null +++ b/media/matmul.svg @@ -0,0 +1,1238 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ARow-major + BTColumn-major + CT=ABTColumn-major + + ne00 + + ne01 + + ne1 + + ne0 + + ne10 + + ne11 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + BRow-major + ATColumn-major + C=BATRow-major + + ne10 + + ne11 + + ne0 + + ne1 + + ne00 + + ne01 + + + diff --git a/models/ggml-vocab-bert-bge.gguf b/models/ggml-vocab-bert-bge.gguf new file mode 100644 index 000000000..b2cbd5df6 Binary files /dev/null and b/models/ggml-vocab-bert-bge.gguf differ diff --git a/models/ggml-vocab-bert-bge.gguf.inp b/models/ggml-vocab-bert-bge.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-bert-bge.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-bert-bge.gguf.out b/models/ggml-vocab-bert-bge.gguf.out new file mode 100644 index 000000000..e4a76cdb0 --- /dev/null +++ b/models/ggml-vocab-bert-bge.gguf.out @@ -0,0 +1,43 @@ + 29464 2094 1018 1092 2706 + 11865 17875 + + + + + + + + + + 7592 2088 + 7592 2088 + 7592 2088 + 7592 2088 + 7592 2088 999 + 7592 1010 2088 999 + 7592 1010 2088 999 + 2023 2003 100 1012 18133 2361 + 1059 2692 18139 1021 8525 28418 2243 16233 20952 6979 + 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 + 100 + 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 1006 2069 7861 29147 2072 2008 2038 2049 2219 19204 1007 + 7592 + 7592 + 7592 + 7592 + 7592 + 7592 7592 + 1006 + 1027 + 1005 3690 + 7592 1010 1061 1005 2035 999 2129 2024 2017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 + 1017 + 3943 + 21211 + 21211 2509 + 21211 22394 + 21211 22394 2509 + 21211 22394 22394 + 21211 22394 22394 2509 + 21211 22394 22394 22394 + 100 1006 3671 1007 100 1006 3674 7861 29147 2483 9530 16280 23854 1007 100 100 1017 3943 21211 21211 2509 21211 22394 21211 22394 2509 21211 22394 22394 21211 22394 22394 2509 1017 1012 1017 1017 1012 1012 1017 1017 1012 1012 1012 1017 100 1029 1855 100 100 6207 100 100 14677 23632 22203 1811 1995 1011 1011 1011 1011 1011 1011 1027 1027 1027 1027 1027 1027 1027 1192 15290 29754 14150 1192 10260 1181 29755 29436 29741 10260 16856 29747 23925 10325 1005 1005 1005 1005 1005 1005 1036 1036 1036 1036 1036 1036 1036 1000 1000 1000 1000 1012 1012 1012 1012 1012 1012 999 999 999 999 999 999 1029 1029 1029 1029 1029 1029 1045 1005 2310 2042 1005 2409 2002 1005 1055 2045 1010 1005 2128 2017 2469 1029 1005 1049 2025 2469 1045 1005 2222 2191 2009 1010 1005 1040 2017 2066 2070 5572 1029 2057 1005 2310 1037 1005 2222 diff --git a/models/ggml-vocab-command-r.gguf b/models/ggml-vocab-command-r.gguf new file mode 100644 index 000000000..b553eab33 Binary files /dev/null and b/models/ggml-vocab-command-r.gguf differ diff --git a/models/ggml-vocab-command-r.gguf.inp b/models/ggml-vocab-command-r.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-command-r.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-command-r.gguf.out b/models/ggml-vocab-command-r.gguf.out new file mode 100644 index 000000000..cc4277daa --- /dev/null +++ b/models/ggml-vocab-command-r.gguf.out @@ -0,0 +1,43 @@ + 2536 228 27 228 22957 6983 + 45 193433 + + 228 + 1667 + 1742 + 205 + 206 + 2126 + 11516 + 34777 + 28339 3845 + 46609 3845 + 28339 3930 + 46609 3930 + 46609 3930 8 + 28339 19 3845 8 + 46609 19 3845 8 + 2075 1801 11254 107 255 21 19317 + 94 23 27 31 228 30 21213 20752 39267 6405 9980 + 4929 40071 2196 3236 8750 1764 37097 41168 + 38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239 + 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16 + 28339 + 46609 + 228 46609 + 1667 46609 + 1742 46609 + 1742 46609 1856 46609 + 1737 + 206 1857 + 14 4515 + 28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 + 26 + 26 26 + 26 26 26 + 26 26 26 26 + 26 26 26 26 26 + 26 26 26 26 26 26 + 26 26 26 26 26 26 26 + 26 26 26 26 26 26 26 26 + 26 26 26 26 26 26 26 26 26 + 127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51 diff --git a/models/ggml-vocab-deepseek-coder.gguf b/models/ggml-vocab-deepseek-coder.gguf new file mode 100644 index 000000000..6728cd747 Binary files /dev/null and b/models/ggml-vocab-deepseek-coder.gguf differ diff --git a/models/ggml-vocab-deepseek-coder.gguf.inp b/models/ggml-vocab-deepseek-coder.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-deepseek-coder.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-coder.gguf.out b/models/ggml-vocab-deepseek-coder.gguf.out new file mode 100644 index 000000000..9ccc560d6 --- /dev/null +++ b/models/ggml-vocab-deepseek-coder.gguf.out @@ -0,0 +1,43 @@ + 1050 207 19 207 19192 4217 + 37 32009 71 6247 + + 207 + 243 + 315 + 184 + 185 + 185 185 + 185 185 185 + 184 185 + 17535 1835 + 414 9489 1835 + 17535 5414 + 414 9489 5414 + 414 9489 5414 0 + 17535 11 1835 0 + 414 9489 11 1835 0 + 437 317 12394 99 234 13 14789 + 86 15 19 23 207 22 83 3963 27659 26078 3934 14072 + 1593 6478 616 2251 14994 + 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 155 239 210 155 239 236 155 239 214 155 240 210 155 239 218 + 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 334 5950 992 78 12896 344 638 891 1372 10736 8 + 17535 + 414 9489 + 207 414 9489 + 243 414 9489 + 315 414 9489 + 315 414 9489 185 315 414 9489 + 334 + 185 405 + 6 2895 + 17535 11 320 6 435 0 1717 417 340 12394 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 + 18 + 18 18 + 18 18 18 + 18 18 18 18 + 18 18 18 18 18 + 18 18 18 18 18 18 + 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 18 + 185 207 185 185 207 185 185 185 207 12405 459 22758 185 243 185 315 185 251 185 730 185 10047 235 209 334 8760 8 12394 233 114 350 222 10047 221 104 169 116 224 334 4684 3909 992 24330 262 29651 612 8 207 156 237 214 12394 99 234 10047 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 524 18 207 18 1202 18 207 155 239 209 155 239 114 155 239 228 155 240 220 155 239 224 155 240 211 155 239 231 155 239 115 155 239 240 155 240 210 155 239 240 155 239 95 155 239 114 155 239 214 10047 233 210 3015 19100 608 9413 2668 16 18 16 19 16 20 16 1393 169 121 239 18155 374 17194 28 2861 6478 616 2251 14994 31269 4191 6 4686 4686 10252 3358 3358 3409 524 15330 3023 15031 5668 303 6 312 798 651 83 839 362 6 82 741 11 651 1369 340 2037 30 651 44 441 2037 303 6 642 1098 359 11 651 35 340 833 738 10860 30 998 6 10709 245 6 75 43 diff --git a/models/ggml-vocab-deepseek-llm.gguf b/models/ggml-vocab-deepseek-llm.gguf new file mode 100644 index 000000000..5d66091c4 Binary files /dev/null and b/models/ggml-vocab-deepseek-llm.gguf differ diff --git a/models/ggml-vocab-deepseek-llm.gguf.inp b/models/ggml-vocab-deepseek-llm.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-deepseek-llm.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-deepseek-llm.gguf.out b/models/ggml-vocab-deepseek-llm.gguf.out new file mode 100644 index 000000000..fd94b896d --- /dev/null +++ b/models/ggml-vocab-deepseek-llm.gguf.out @@ -0,0 +1,43 @@ + 1052 207 19 207 19109 4223 + 37 100014 71 6245 + + 207 + 243 + 300 + 184 + 185 + 185 185 + 185 185 185 + 184 185 + 17464 1843 + 37727 1843 + 17464 5427 + 37727 5427 + 37727 5427 0 + 17464 11 1843 0 + 37727 11 1843 0 + 437 317 12356 99 234 13 14743 + 86 15 19 23 207 22 83 3970 27519 26016 3944 14025 + 1603 6476 620 91754 + 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71374 210 71374 236 71374 214 155 240 210 71374 218 + 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 334 5956 89213 344 643 895 1377 10728 8 + 17464 + 37727 + 207 37727 + 243 37727 + 300 37727 + 300 37727 185 300 37727 + 334 + 185 403 + 6 2906 + 17464 11 320 6 436 0 1724 418 340 33701 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 + 18 + 18 18 + 18 18 18 + 18 18 18 18 + 18 18 18 18 18 + 18 18 18 18 18 18 + 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 18 + 185 207 185 185 207 185 185 185 207 11969 486 22504 185 243 185 300 185 251 185 663 185 10044 95300 334 8754 8 33701 114 350 222 10044 221 104 46713 334 34732 996 24250 262 80923 8 207 37103 214 12356 99 234 10044 99 234 207 18 207 18 18 207 18 18 18 207 18 18 18 18 207 18 18 18 18 18 207 18 18 18 18 18 18 207 18 18 18 18 18 18 18 207 18 18 18 18 18 18 18 18 207 18 13 18 207 18 526 18 207 18 1204 18 207 71374 209 71374 114 71374 228 155 240 220 71374 224 155 240 211 71374 231 71374 115 71374 240 155 240 210 71374 240 71374 95 71374 114 71374 214 71899 210 3025 19017 612 9407 2681 16 18 16 19 16 20 16 1398 68940 239 78827 55170 76659 620 91754 31116 36804 4885 4885 10897 4390 4390 41047 15278 3033 14986 5675 304 6 313 803 655 33326 362 6 82 745 11 655 1374 340 2049 30 655 44 441 2049 304 6 647 1099 359 11 655 35 340 837 742 10842 30 1003 6 10699 245 6 75 43 diff --git a/models/ggml-vocab-falcon.gguf b/models/ggml-vocab-falcon.gguf index d4ea2e822..334d50da5 100644 Binary files a/models/ggml-vocab-falcon.gguf and b/models/ggml-vocab-falcon.gguf differ diff --git a/models/ggml-vocab-falcon.gguf.inp b/models/ggml-vocab-falcon.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-falcon.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-falcon.gguf.out b/models/ggml-vocab-falcon.gguf.out new file mode 100644 index 000000000..209b04cda --- /dev/null +++ b/models/ggml-vocab-falcon.gguf.out @@ -0,0 +1,43 @@ + 878 204 31 3068 133 2137 + 28611 132 30042 + + 204 + 258 + 466 + 192 + 193 + 1001 + 11331 + 19125 + 9856 1079 + 23090 1079 + 9856 2889 + 23090 2889 + 23090 2889 12 + 9856 23 1079 12 + 23090 23 1079 12 + 414 304 3346 111 231 25 29247 + 98 55866 204 34 16682 7149 36190 6869 11481 + 150 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 + 38154 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 38154 207 38154 233 38154 211 167 237 207 38154 215 + 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 204 19 7927 53360 325 504 701 946 10930 20 + 9856 + 23090 + 204 23090 + 258 23090 + 466 23090 + 466 23090 742 23090 + 204 19 + 1212 40 + 18 4932 + 9856 23 291 18 436 12 1265 362 299 8196 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 + 30 + 3138 + 22287 + 22287 30 + 22287 3138 + 22287 22287 + 22287 22287 30 + 22287 22287 3138 + 22287 22287 22287 + 1212 4824 1001 1212 192 204 663 49453 2069 742 561 1501 193 2571 232 206 204 19 11003 20 8196 126 283 219 48778 116 13392 204 19 51831 732 63209 1741 7955 522 20 22438 211 3346 111 231 2571 111 231 204 30 204 3138 204 22287 204 22287 30 204 22287 3138 204 22287 22287 204 22287 22287 30 204 22287 22287 3138 204 30 25 30 204 30 513 30 204 30 951 30 27171 236 206 38154 126 38154 225 167 237 217 38154 221 167 237 208 38154 228 38154 127 38154 237 167 237 207 38154 237 38154 107 38154 126 38154 211 20589 207 204 42 50087 123 2727 20300 32022 133 234 17419 30137 28 7858 181 133 236 204 37057 2228 10666 5052 133 6207 151 215 150 134 5052 133 6279 5052 223 151 216 49679 123 53110 47043 7795 204 7544 7544 7544 8543 8543 17593 3513 3513 12844 51520 17664 4247 295 18 298 650 204 18 95 693 332 18 94 629 23 204 18 1553 299 1310 42 204 18 56 416 1310 295 18 567 717 334 23 204 18 47 299 606 596 6696 42 703 18 16139 241 18 87 55 diff --git a/models/ggml-vocab-gpt-2.gguf b/models/ggml-vocab-gpt-2.gguf new file mode 100644 index 000000000..5ea85cf52 Binary files /dev/null and b/models/ggml-vocab-gpt-2.gguf differ diff --git a/models/ggml-vocab-gpt-2.gguf.inp b/models/ggml-vocab-gpt-2.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-gpt-2.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-gpt-2.gguf.out b/models/ggml-vocab-gpt-2.gguf.out new file mode 100644 index 000000000..78430f0d3 --- /dev/null +++ b/models/ggml-vocab-gpt-2.gguf.out @@ -0,0 +1,43 @@ + 798 604 25208 1933 + 37 9116 71 11751 + + 220 + 220 220 + 220 220 220 + 197 + 198 + 628 + 628 198 + 197 198 + 15496 995 + 18435 995 + 15496 2159 + 18435 2159 + 18435 2159 0 + 15496 11 995 0 + 18435 11 995 0 + 428 318 12520 99 247 13 20322 + 86 47202 767 28047 45961 288 82 7568 13415 + 22177 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 + 157 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 157 252 223 157 252 249 157 252 227 157 253 223 157 252 231 + 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 357 8807 44805 326 468 663 898 11241 8 + 15496 + 18435 + 220 18435 + 220 220 18435 + 220 220 220 18435 + 220 220 220 18435 198 220 220 220 18435 + 357 + 198 796 + 6 6980 + 15496 11 331 6 439 0 1374 389 345 30325 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 + 18 + 2091 + 20370 + 24840 + 2091 20370 + 24840 2091 + 24840 20370 + 24840 24840 + 24840 2091 20370 + 198 220 628 220 628 198 220 197 220 197 197 220 197 198 220 220 198 220 220 220 198 220 220 220 220 198 220 220 220 220 220 198 8582 248 222 357 11265 8 30325 114 447 235 8582 234 104 37929 357 48101 795 13210 271 1673 36686 515 8 14519 227 12520 99 247 8582 99 247 513 4747 23460 513 20370 23460 2091 23460 20370 23460 24840 23460 2091 20370 513 13 18 513 492 18 513 986 18 28053 252 222 157 252 114 157 252 241 157 253 233 157 252 237 157 253 224 157 252 244 157 252 115 157 252 253 157 253 223 157 252 253 157 252 95 157 252 114 157 252 227 47249 223 5633 22755 239 46349 111 28839 101 18040 32432 98 43291 1485 1415 24309 25465 171 121 252 40103 1421 18604 12466 121 16843 141 231 15166 12466 121 16142 12466 239 141 232 30143 140 111 16142 21169 21727 31583 18849 705 39115 6 33153 15506 63 15931 15931 16317 13896 3228 9805 3548 314 1053 587 705 44040 339 338 612 11 705 2200 345 1654 30 705 44 407 1654 314 1183 787 340 11 705 35 345 588 617 8887 30 775 6 26979 257 6 75 43 diff --git a/models/ggml-vocab-llama-bpe.gguf b/models/ggml-vocab-llama-bpe.gguf new file mode 100644 index 000000000..e51a99118 Binary files /dev/null and b/models/ggml-vocab-llama-bpe.gguf differ diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-llama-bpe.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out new file mode 100644 index 000000000..1f00e3812 --- /dev/null +++ b/models/ggml-vocab-llama-bpe.gguf.out @@ -0,0 +1,43 @@ + 1142 220 19 220 27154 4038 + 37 51853 261 + + 220 + 256 + 262 + 197 + 198 + 271 + 1432 + 1602 + 9906 1917 + 22691 1917 + 9906 4435 + 22691 4435 + 22691 4435 0 + 9906 11 1917 0 + 22691 11 1917 0 + 420 374 11410 99 247 13 11055 + 86 23904 220 22 83 2005 42908 11729 3013 17156 + 79862 102118 13373 64571 34694 3114 112203 80112 + 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 21549 223 21549 249 21549 227 45358 223 21549 231 + 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 320 3323 43465 430 706 1202 1866 4037 8 + 9906 + 22691 + 220 22691 + 256 22691 + 262 22691 + 262 22691 198 262 22691 + 320 + 198 284 + 6 11639 + 9906 11 379 65948 0 2650 527 499 27623 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 + 18 + 1644 + 8765 + 8765 18 + 8765 1644 + 8765 8765 + 8765 8765 18 + 8765 8765 1644 + 8765 8765 8765 + 198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43 diff --git a/models/ggml-vocab-llama.gguf b/models/ggml-vocab-llama-spm.gguf similarity index 99% rename from models/ggml-vocab-llama.gguf rename to models/ggml-vocab-llama-spm.gguf index 549eed8c5..658295a5d 100644 Binary files a/models/ggml-vocab-llama.gguf and b/models/ggml-vocab-llama-spm.gguf differ diff --git a/models/ggml-vocab-llama-spm.gguf.inp b/models/ggml-vocab-llama-spm.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-llama-spm.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-llama-spm.gguf.out b/models/ggml-vocab-llama-spm.gguf.out new file mode 100644 index 000000000..9c3327cb5 --- /dev/null +++ b/models/ggml-vocab-llama-spm.gguf.out @@ -0,0 +1,43 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 + + 259 + 1678 + 268 + 29871 12 + 29871 13 + 29871 13 13 + 29871 13 13 13 + 29871 12 13 + 15043 3186 + 29871 15043 3186 + 15043 2787 + 29871 15043 2787 + 29871 15043 2787 29991 + 15043 29892 3186 29991 + 29871 15043 29892 3186 29991 + 29871 445 338 29871 243 162 169 156 29889 8223 + 281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905 + 1538 4851 665 1386 29713 1305 + 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140 + 29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897 + 15043 + 29871 15043 + 259 15043 + 1678 15043 + 268 15043 + 268 15043 13 1678 15043 + 29871 313 + 29871 13 353 + 525 3152 + 15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 + 29871 29941 + 29871 29941 29941 + 29871 29941 29941 29941 + 29871 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 29941 29941 + 29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931 diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf index 6affa34bd..f42f56dec 100644 Binary files a/models/ggml-vocab-mpt.gguf and b/models/ggml-vocab-mpt.gguf differ diff --git a/models/ggml-vocab-mpt.gguf.inp b/models/ggml-vocab-mpt.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-mpt.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-mpt.gguf.out b/models/ggml-vocab-mpt.gguf.out new file mode 100644 index 000000000..d8d0fe909 --- /dev/null +++ b/models/ggml-vocab-mpt.gguf.out @@ -0,0 +1,43 @@ + 728 577 24142 2607 + 39 26288 6554 + + 209 + 50276 + 50275 + 186 + 187 + 535 + 2756 + 186 187 + 12092 1533 + 24387 1533 + 12092 3645 + 24387 3645 + 24387 3645 2 + 12092 13 1533 2 + 24387 13 1533 2 + 436 310 22692 101 236 15 14161 + 88 27244 818 16853 16392 20505 4989 11917 + 32520 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 + 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 18081 212 18081 238 18081 216 39936 212 18081 220 + 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 313 7483 802 80 8020 326 556 697 1211 10669 10 + 12092 + 24387 + 50276 12092 + 50275 12092 + 50274 12092 + 50274 12092 187 50274 12092 + 313 + 187 426 + 8 8685 + 12092 13 340 8 455 2 1359 403 368 49042 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 + 20 + 1610 + 20084 + 26409 + 1610 20084 + 26409 1610 + 26409 20084 + 26409 26409 + 26409 1610 20084 + 586 1744 33525 186 209 623 28910 187 50276 187 50275 187 50274 187 50273 187 14931 237 211 313 6320 10 49042 116 325 224 14931 223 106 171 118 226 313 34263 802 13511 261 32147 456 10 3384 239 216 22692 101 236 14931 101 236 495 5922 30057 495 20084 495 26409 30057 20084 495 26409 1610 495 26409 20084 495 15 20 495 537 20 495 1051 20 209 18081 211 18081 116 18081 230 39936 222 18081 226 39936 213 18081 233 18081 117 18081 242 39936 212 18081 242 18081 97 18081 116 18081 216 14931 235 212 3736 15367 41197 13610 19934 41869 21275 1012 1047 18795 40120 20422 241 16081 6877 12880 11514 1068 8713 38177 13396 3415 9925 12559 10453 1389 42011 35033 34842 11202 9739 9739 33021 18963 4672 25561 8220 309 1849 644 686 42618 344 434 627 13 686 1848 368 2119 32 686 46 417 2119 309 1833 1056 352 13 686 37 368 751 690 10331 32 844 8 31516 247 8 77 45 diff --git a/models/ggml-vocab-phi-3.gguf b/models/ggml-vocab-phi-3.gguf new file mode 100644 index 000000000..f8022a385 Binary files /dev/null and b/models/ggml-vocab-phi-3.gguf differ diff --git a/models/ggml-vocab-phi-3.gguf.inp b/models/ggml-vocab-phi-3.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-phi-3.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-phi-3.gguf.out b/models/ggml-vocab-phi-3.gguf.out new file mode 100644 index 000000000..9c3327cb5 --- /dev/null +++ b/models/ggml-vocab-phi-3.gguf.out @@ -0,0 +1,43 @@ + 474 287 29871 29946 29871 30226 7378 + 383 4000 261 + + 259 + 1678 + 268 + 29871 12 + 29871 13 + 29871 13 13 + 29871 13 13 13 + 29871 12 13 + 15043 3186 + 29871 15043 3186 + 15043 2787 + 29871 15043 2787 + 29871 15043 2787 29991 + 15043 29892 3186 29991 + 29871 15043 29892 3186 29991 + 29871 445 338 29871 243 162 169 156 29889 8223 + 281 29900 29946 29947 29871 29955 9161 13535 18031 2176 6905 + 1538 4851 665 1386 29713 1305 + 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 228 161 132 228 161 158 228 161 136 228 162 132 228 161 140 + 29871 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 313 6194 953 29877 2397 393 756 967 1914 5993 29897 + 15043 + 29871 15043 + 259 15043 + 1678 15043 + 268 15043 + 268 15043 13 1678 15043 + 29871 313 + 29871 13 353 + 525 3152 + 15043 29892 343 29915 497 29991 1128 526 366 29871 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 + 29871 29941 + 29871 29941 29941 + 29871 29941 29941 29941 + 29871 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 29941 + 29871 29941 29941 29941 29941 29941 29941 29941 29941 29941 + 29871 13 29871 13 13 29871 13 13 13 29871 12 29871 12 12 29871 12 13 259 13 1678 13 268 13 418 13 243 162 157 131 313 8945 29897 29871 243 162 155 185 30722 243 162 143 174 30598 313 20787 953 3848 275 16125 630 29897 29871 31681 29871 243 162 169 156 243 162 169 156 29871 29941 29871 29941 29941 29871 29941 29941 29941 29871 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29871 29941 29941 29941 29941 29941 29941 29941 29941 29871 29941 29889 29941 29871 29941 636 29941 29871 29941 856 29941 29871 31849 31324 31934 228 162 142 228 161 146 228 162 133 228 161 153 228 161 186 31708 228 162 132 31708 228 161 165 31324 228 161 136 243 162 155 132 1577 30672 31522 30505 11548 31041 30732 29896 29941 29896 29946 29896 29945 29896 30408 30739 448 23648 2751 25512 1538 4851 665 1386 29713 1305 14550 4907 11120 16159 16159 16159 15945 15945 3045 636 6824 6824 6824 8773 8773 8773 306 29915 345 1063 525 29873 1025 540 29915 29879 727 29892 525 1525 366 1854 29973 525 29924 451 1854 306 29915 645 1207 372 29892 525 29928 366 763 777 23429 29973 1334 29915 29963 29872 263 29915 29880 29931 diff --git a/models/ggml-vocab-qwen2.gguf b/models/ggml-vocab-qwen2.gguf new file mode 100644 index 000000000..541e475bc Binary files /dev/null and b/models/ggml-vocab-qwen2.gguf differ diff --git a/models/ggml-vocab-qwen2.gguf.inp b/models/ggml-vocab-qwen2.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-qwen2.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-qwen2.gguf.out b/models/ggml-vocab-qwen2.gguf.out new file mode 100644 index 000000000..401a510e8 --- /dev/null +++ b/models/ggml-vocab-qwen2.gguf.out @@ -0,0 +1,43 @@ + 1122 220 19 220 26062 3951 + 37 50753 261 + + 220 + 256 + 262 + 197 + 198 + 271 + 1406 + 1572 + 9707 1879 + 21927 1879 + 9707 4337 + 21927 4337 + 21927 4337 0 + 9707 11 1879 0 + 21927 11 1879 0 + 419 374 11162 99 247 13 10821 + 86 15 19 23 220 22 83 1963 41808 11472 2940 16739 + 78762 14144 1456 13073 63471 33594 3038 133178 79012 + 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848 + 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8 + 9707 + 21927 + 220 21927 + 256 21927 + 262 21927 + 262 21927 198 262 21927 + 320 + 198 284 + 6 11385 + 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 + 18 + 18 18 + 18 18 18 + 18 18 18 18 + 18 18 18 18 18 + 18 18 18 18 18 18 + 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 + 18 18 18 18 18 18 18 18 18 + 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43 diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf index 8f26cfb76..52afcf01a 100644 Binary files a/models/ggml-vocab-refact.gguf and b/models/ggml-vocab-refact.gguf differ diff --git a/models/ggml-vocab-refact.gguf.inp b/models/ggml-vocab-refact.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-refact.gguf.out b/models/ggml-vocab-refact.gguf.out new file mode 100644 index 000000000..06b15c090 --- /dev/null +++ b/models/ggml-vocab-refact.gguf.out @@ -0,0 +1,43 @@ + 4833 225 38 225 143 140 17723 + 56 2006 3935 265 + + 225 + 261 + 264 + 202 + 203 + 478 + 2831 + 15773 + 8279 5788 + 12000 5788 + 8279 10896 + 12000 10896 + 12000 10896 19 + 8279 30 5788 19 + 12000 30 5788 19 + 458 438 5945 118 252 32 3766 + 105 34 38 42 225 41 102 1707 12530 10180 1479 8278 + 39862 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 + 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 14574 228 14574 254 14574 232 30457 228 14574 236 + 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 308 2585 22680 688 1401 2819 4369 2404 27 + 8279 + 12000 + 225 12000 + 261 12000 + 264 12000 + 264 12000 284 12000 + 308 + 203 280 + 25 34666 + 8279 30 533 25 464 19 4971 884 844 18458 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 + 37 + 37 37 + 37 37 37 + 37 37 37 37 + 37 37 37 37 37 + 37 37 37 37 37 37 + 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 + 37 37 37 37 37 37 37 37 37 + 334 719 8878 202 10885 4222 16104 28570 203 3807 253 227 308 4382 27 18458 133 46113 44967 123 13868 308 12565 19775 33071 40824 733 27 41889 5945 118 252 3807 118 252 225 37 225 37 37 225 37 37 37 225 37 37 37 37 225 37 37 37 37 37 225 37 37 37 37 37 37 225 37 37 37 37 37 37 37 225 37 37 37 37 37 37 37 37 225 37 32 37 225 37 497 37 225 37 1179 37 225 14574 227 14574 133 14574 246 30457 238 14574 242 30457 229 14574 249 14574 134 14574 258 30457 228 14574 258 14574 114 14574 133 14574 232 36628 228 1018 4982 13368 2909 9513 17827 35 37 35 38 35 39 35 11873 47838 20921 16623 13028 8372 1039 9446 40242 13852 2053 8949 12531 1520 10700 5881 9592 13299 914 31753 31359 9163 3202 35472 10397 439 4763 2583 330 102 1455 938 1182 2017 30 330 613 844 3654 49 330 63 646 3654 439 4621 1930 561 30 330 54 844 2124 1629 35993 49 2688 25 7709 312 25 94 62 diff --git a/models/ggml-vocab-stablelm-3b-4e1t.gguf b/models/ggml-vocab-stablelm.gguf similarity index 100% rename from models/ggml-vocab-stablelm-3b-4e1t.gguf rename to models/ggml-vocab-stablelm.gguf diff --git a/models/ggml-vocab-starcoder.gguf b/models/ggml-vocab-starcoder.gguf index a52983fdb..7a7e7742a 100644 Binary files a/models/ggml-vocab-starcoder.gguf and b/models/ggml-vocab-starcoder.gguf differ diff --git a/models/ggml-vocab-starcoder.gguf.inp b/models/ggml-vocab-starcoder.gguf.inp new file mode 100644 index 000000000..0a89107c6 --- /dev/null +++ b/models/ggml-vocab-starcoder.gguf.inp @@ -0,0 +1,106 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-starcoder.gguf.out b/models/ggml-vocab-starcoder.gguf.out new file mode 100644 index 000000000..ccb55c7fe --- /dev/null +++ b/models/ggml-vocab-starcoder.gguf.out @@ -0,0 +1,43 @@ + 4850 244 57 244 162 159 17722 + 75 2022 3943 284 + + 244 + 280 + 283 + 221 + 222 + 499 + 3067 + 15767 + 8302 5810 + 12009 5810 + 8302 10914 + 12009 10914 + 12009 10914 38 + 8302 49 5810 38 + 12009 49 5810 38 + 477 458 5954 137 271 51 3779 + 124 53 57 61 244 60 121 1726 12568 10240 1519 8290 + 39916 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721 + 14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 14566 247 14566 273 14566 251 30428 247 14566 255 + 3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 327 2605 22716 708 1421 2840 4387 2421 46 + 8302 + 12009 + 244 12009 + 280 12009 + 283 12009 + 283 12009 303 12009 + 327 + 222 299 + 44 34719 + 8302 49 553 44 483 38 4998 904 863 18445 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892 + 56 + 56 56 + 56 56 56 + 56 56 56 56 + 56 56 56 56 56 + 56 56 56 56 56 56 + 56 56 56 56 56 56 56 + 56 56 56 56 56 56 56 56 + 56 56 56 56 56 56 56 56 56 + 353 736 8886 221 10883 4238 16101 28540 222 3822 272 246 327 4434 46 18445 152 46030 45022 142 13878 327 12585 19884 33773 40920 751 46 41839 5954 137 271 3822 137 271 244 56 244 56 56 244 56 56 56 244 56 56 56 56 244 56 56 56 56 56 244 56 56 56 56 56 56 244 56 56 56 56 56 56 56 244 56 56 56 56 56 56 56 56 244 56 51 56 244 56 516 56 244 56 1198 56 244 14566 246 14566 152 14566 265 30428 257 14566 261 30428 248 14566 268 14566 153 14566 277 30428 247 14566 277 14566 133 14566 152 14566 251 36570 247 1037 4995 13379 2924 9515 17823 54 56 54 57 54 58 54 11904 47892 20895 16625 13047 8389 1059 9504 40216 13858 2073 8983 12571 1539 10721 5918 9643 13298 932 31723 31330 9221 3226 35426 10400 457 4783 2602 349 121 1477 957 1200 2038 49 349 632 863 3673 68 349 82 666 3673 457 4650 1949 580 49 349 73 863 2144 1649 35941 68 2726 44 7728 331 44 113 81 diff --git a/pyrightconfig.json b/pyrightconfig.json new file mode 100644 index 000000000..020a71a4e --- /dev/null +++ b/pyrightconfig.json @@ -0,0 +1,3 @@ +{ + "extraPaths": ["gguf-py"], +} diff --git a/requirements.txt b/requirements.txt index d36f74520..fc1e28278 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,6 +7,7 @@ -r ./requirements/requirements-convert.txt -r ./requirements/requirements-convert-hf-to-gguf.txt +-r ./requirements/requirements-convert-hf-to-gguf-update.txt -r ./requirements/requirements-convert-llama-ggml-to-gguf.txt -r ./requirements/requirements-convert-lora-to-ggml.txt -r ./requirements/requirements-convert-persimmon-to-gguf.txt diff --git a/requirements/requirements-convert-hf-to-gguf-update.txt b/requirements/requirements-convert-hf-to-gguf-update.txt new file mode 100644 index 000000000..6ac402610 --- /dev/null +++ b/requirements/requirements-convert-hf-to-gguf-update.txt @@ -0,0 +1,2 @@ +-r ./requirements-convert.txt +torch~=2.1.1 diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index 6ce840d73..6ac402610 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,3 +1,2 @@ -r ./requirements-convert.txt torch~=2.1.1 -einops~=0.7.0 diff --git a/requirements/requirements-convert.txt b/requirements/requirements-convert.txt index a3d6ecec0..7ab1228cb 100644 --- a/requirements/requirements-convert.txt +++ b/requirements/requirements-convert.txt @@ -1,5 +1,5 @@ numpy~=1.24.4 -sentencepiece~=0.1.98 -transformers>=4.35.2,<5.0.0 +sentencepiece~=0.2.0 +transformers>=4.40.1,<5.0.0 gguf>=0.1.0 protobuf>=4.21.0,<5.0.0 diff --git a/scripts/LlamaConfig.cmake.in b/scripts/LlamaConfig.cmake.in index 6a6d8e39e..f842c7137 100644 --- a/scripts/LlamaConfig.cmake.in +++ b/scripts/LlamaConfig.cmake.in @@ -3,7 +3,7 @@ set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_BLAS @LLAMA_BLAS@) -set(LLAMA_CUBLAS @LLAMA_CUBLAS@) +set(LLAMA_CUDA @LLAMA_CUDA@) set(LLAMA_METAL @LLAMA_METAL@) set(LLAMA_MPI @LLAMA_MPI@) set(LLAMA_CLBLAST @LLAMA_CLBLAST@) @@ -27,7 +27,7 @@ if (LLAMA_BLAS) find_package(BLAS REQUIRED) endif() -if (LLAMA_CUBLAS) +if (LLAMA_CUDA) find_package(CUDAToolkit REQUIRED) endif() diff --git a/scripts/check-requirements.sh b/scripts/check-requirements.sh index af7bab753..6a7400d3c 100755 --- a/scripts/check-requirements.sh +++ b/scripts/check-requirements.sh @@ -168,6 +168,11 @@ fi check_convert_script convert.py for py in convert-*.py; do + # skip convert-hf-to-gguf-update.py + # TODO: the check is failing for some reason: + # https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920 + [[ $py == convert-hf-to-gguf-update.py ]] && continue + check_convert_script "$py" done diff --git a/scripts/compare-commits.sh b/scripts/compare-commits.sh index 331c4b9ce..fd0ee88b2 100755 --- a/scripts/compare-commits.sh +++ b/scripts/compare-commits.sh @@ -12,19 +12,7 @@ bench_args="${@:3}" rm -f llama-bench.sqlite -backend="cpu" - -if [[ "$OSTYPE" == "darwin"* ]]; then - backend="metal" -elif command -v nvcc &> /dev/null; then - backend="cuda" -fi - -make_opts="" - -if [[ "$backend" == "cuda" ]]; then - make_opts="LLAMA_CUBLAS=1" -fi +# to test a backend, call the script with the corresponding environment variable (e.g. LLAMA_CUDA=1 ./scripts/compare-commits.sh ...) git checkout $1 make clean && make -j32 $make_opts llama-bench diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 54a7771d8..fed3c1ee3 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import heapq import sys @@ -11,9 +12,11 @@ try: import git from tabulate import tabulate except ImportError as e: - print("ERROR: the following Python libraries are required: GitPython, tabulate.") + print("the following Python libraries are required: GitPython, tabulate.") # noqa: NP100 raise e +logger = logging.getLogger("compare-llama-bench") + # Properties by which to differentiate results per commit: KEY_PROPERTIES = [ "cpu_info", "gpu_info", "n_gpu_layers", "main_gpu", "cuda", "opencl", "metal", "gpu_blas", @@ -90,12 +93,14 @@ help_s = ( "specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench." ) parser.add_argument("-s", "--show", help=help_s) +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") known_args, unknown_args = parser.parse_known_args() +logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) + if unknown_args: - print(f"ERROR: Received unknown args: {unknown_args}.") - print() + logger.error(f"Received unknown args: {unknown_args}.\n") parser.print_help() sys.exit(1) @@ -108,8 +113,7 @@ if input_file is None: input_file = sqlite_files[0] if input_file is None: - print("ERROR: Cannot find a suitable input file, please provide one.") - print() + logger.error("Cannot find a suitable input file, please provide one.\n") parser.print_help() sys.exit(1) @@ -178,6 +182,9 @@ def get_commit_hexsha8(name): for t in repo.tags: if t.name == name: return t.commit.hexsha[:8] + for c in repo.iter_commits("--all"): + if c.hexsha[:8] == name[:8]: + return c.hexsha[:8] return None @@ -191,23 +198,19 @@ if known_args.baseline is not None: hexsha8_baseline = get_commit_hexsha8(known_args.baseline) name_baseline = known_args.baseline if hexsha8_baseline is None: - print(f"ERROR: cannot find data for baseline={known_args.baseline}.") + logger.error(f"cannot find data for baseline={known_args.baseline}.") sys.exit(1) # Otherwise, search for the most recent parent of master for which there is data: elif repo is not None: hexsha8_baseline = find_parent_in_data(repo.heads.master.commit) if hexsha8_baseline is None: - print("ERROR: No baseline was provided and did not find data for any master branch commits.") - print() + logger.error("No baseline was provided and did not find data for any master branch commits.\n") parser.print_help() sys.exit(1) else: - print( - "ERROR: No baseline was provided and the current working directory " - "is not part of a git repository from which a baseline could be inferred." - ) - print() + logger.error("No baseline was provided and the current working directory " + "is not part of a git repository from which a baseline could be inferred.\n") parser.print_help() sys.exit(1) @@ -224,7 +227,7 @@ if known_args.compare is not None: hexsha8_compare = get_commit_hexsha8(known_args.compare) name_compare = known_args.compare if hexsha8_compare is None: - print(f"ERROR: cannot find data for baseline={known_args.compare}.") + logger.error(f"cannot find data for compare={known_args.compare}.") sys.exit(1) # Otherwise, search for the commit for llama-bench was most recently run # and that is not a parent of master: @@ -238,16 +241,12 @@ elif repo is not None: break if hexsha8_compare is None: - print("ERROR: No compare target was provided and did not find data for any non-master commits.") - print() + logger.error("No compare target was provided and did not find data for any non-master commits.\n") parser.print_help() sys.exit(1) else: - print( - "ERROR: No compare target was provided and the current working directory " - "is not part of a git repository from which a compare target could be inferred." - ) - print() + logger.error("No compare target was provided and the current working directory " + "is not part of a git repository from which a compare target could be inferred.\n") parser.print_help() sys.exit(1) @@ -281,8 +280,7 @@ if known_args.show is not None: if prop not in KEY_PROPERTIES[:-2]: # Last two values are n_prompt, n_gen. unknown_cols.append(prop) if unknown_cols: - print(f"ERROR: Unknown values for --show: {', '.join(unknown_cols)}") - print() + logger.error(f"Unknown values for --show: {', '.join(unknown_cols)}") parser.print_usage() sys.exit(1) rows_show = get_rows(show) @@ -366,7 +364,7 @@ if "gpu_info" in show: headers = [PRETTY_NAMES[p] for p in show] headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"] -print(tabulate( +print(tabulate( # noqa: NP100 table, headers=headers, floatfmt=".2f", diff --git a/scripts/gen-authors.sh b/scripts/gen-authors.sh new file mode 100755 index 000000000..3ef8391cc --- /dev/null +++ b/scripts/gen-authors.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +printf "# date: $(date)\n" > AUTHORS +printf "# this file is auto-generated by scripts/gen-authors.sh\n\n" >> AUTHORS + +git log --format='%an <%ae>' --reverse --date=short master | awk '!seen[$0]++' | sort >> AUTHORS + +# if necessary, update your name here. for example: jdoe -> John Doe +sed -i '' 's/^jdoe/John Doe/g' AUTHORS diff --git a/scripts/gen-unicode-data.py b/scripts/gen-unicode-data.py new file mode 100644 index 000000000..37d1e396c --- /dev/null +++ b/scripts/gen-unicode-data.py @@ -0,0 +1,64 @@ +import regex + + +def get_matches(regex_expr): + regex_expr_compiled = regex.compile(regex_expr) + unicode_ranges = [] + current_range = None + + for codepoint in range(0x110000): + char = chr(codepoint) + if regex_expr_compiled.match(char): + if current_range is None: + current_range = [codepoint, codepoint] + else: + current_range[1] = codepoint + elif current_range is not None: + unicode_ranges.append(tuple(current_range)) + current_range = None + + if current_range is not None: + unicode_ranges.append(tuple(current_range)) + + return unicode_ranges + + +def print_cat(mode, cat, ranges): + if mode == "range": + print("const std::vector> unicode_ranges_{} = {{".format(cat)) # noqa: NP100 + if mode == "map": + print("const std::map unicode_map_{} = {{".format(cat)) # noqa: NP100 + for i, values in enumerate(ranges): + end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", " + values = ["0x%08X" % value for value in values] + print("{" + ", ".join(values) + "}", end=end) # noqa: NP100 + print("};") # noqa: NP100 + print("") # noqa: NP100 + + +print_cat("range", "number", get_matches(r'\p{N}')) +print_cat("range", "letter", get_matches(r'\p{L}')) +print_cat("range", "separator", get_matches(r'\p{Z}')) +print_cat("range", "accent_mark", get_matches(r'\p{M}')) +print_cat("range", "punctuation", get_matches(r'\p{P}')) +print_cat("range", "symbol", get_matches(r'\p{S}')) +print_cat("range", "control", get_matches(r'\p{C}')) + +print_cat("range", "whitespace", get_matches(r'\s')) + + +map_lowercase = [] +map_uppercase = [] +for codepoint in range(0x110000): + char = chr(codepoint) + lower = ord(char.lower()[0]) + upper = ord(char.upper()[0]) + if codepoint != lower: + map_lowercase.append((codepoint, lower)) + if codepoint != upper: + map_uppercase.append((codepoint, upper)) +print_cat("map", "lowercase", map_lowercase) +print_cat("map", "uppercase", map_uppercase) + + +# TODO: generate unicode_map_nfd diff --git a/scripts/get-wikitext-103.sh b/scripts/get-wikitext-103.sh new file mode 100755 index 000000000..880dd5cbe --- /dev/null +++ b/scripts/get-wikitext-103.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip + +echo "Usage:" +echo "" +echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo "" + +exit 0 diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh index 7ca760fa6..b01476a46 100755 --- a/scripts/get-wikitext-2.sh +++ b/scripts/get-wikitext-2.sh @@ -1,10 +1,11 @@ #!/bin/bash wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip +unzip wikitext-2-raw-v1.zip echo "Usage:" echo "" -echo " ./perplexity -m model.gguf -f wiki.test.raw [other params]" +echo " ./perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" echo "" exit 0 diff --git a/scripts/hf.sh b/scripts/hf.sh index 1e9e5a6ea..58f83d6fe 100755 --- a/scripts/hf.sh +++ b/scripts/hf.sh @@ -3,9 +3,9 @@ # Shortcut for downloading HF models # # Usage: -# ./main -m $(./examples/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./examples/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) -# ./main -m $(./examples/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --url https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/blob/main/mixtral-8x7b-v0.1.Q4_K_M.gguf) +# ./main -m $(./scripts/hf.sh --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf) # # all logs go to stderr @@ -14,7 +14,7 @@ function log { } function usage { - log "Usage: $0 [[--url] ] [--repo ] [--file ] [-h|--help]" + log "Usage: $0 [[--url] ] [--repo ] [--file ] [--outdir [-h|--help]" exit 1 } @@ -26,9 +26,9 @@ function has_cmd { } if has_cmd wget; then - cmd="wget -q --show-progress -c -O %s %s" + cmd="wget -q --show-progress -c -O %s/%s %s" elif has_cmd curl; then - cmd="curl -C - -f -o %s -L %s" + cmd="curl -C - -f --output-dir %s -o %s -L %s" else log "[E] curl or wget not found" exit 1 @@ -37,6 +37,7 @@ fi url="" repo="" file="" +outdir="." # parse args while [[ $# -gt 0 ]]; do @@ -53,6 +54,10 @@ while [[ $# -gt 0 ]]; do file="$2" shift 2 ;; + --outdir) + outdir="$2" + shift 2 + ;; -h|--help) usage ;; @@ -94,10 +99,10 @@ basename=$(basename $url) log "[+] attempting to download $basename" if [ -n "$cmd" ]; then - cmd=$(printf "$cmd" "$basename" "$url") + cmd=$(printf "$cmd" "$outdir" "$basename" "$url") log "[+] $cmd" if $cmd; then - echo $basename + echo $outdir/$basename exit 0 fi fi diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh index 6cf1ab4f3..2058ceabf 100644 --- a/scripts/pod-llama.sh +++ b/scripts/pod-llama.sh @@ -42,7 +42,7 @@ git clone https://github.com/ggerganov/llama.cpp cd llama.cpp -LLAMA_CUBLAS=1 make -j +LLAMA_CUDA=1 make -j ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b @@ -60,7 +60,7 @@ cd /workspace/llama.cpp mkdir build-cublas cd build-cublas -cmake -DLLAMA_CUBLAS=1 ../ +cmake -DLLAMA_CUDA=1 ../ make -j if [ "$1" -eq "0" ]; then @@ -186,17 +186,17 @@ if [ "$1" -eq "1" ]; then # batched cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + LLAMA_CUDA=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 # batched-bench cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + LLAMA_CUDA=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 # parallel cd /workspace/llama.cpp - LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + LLAMA_CUDA=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb fi @@ -204,10 +204,10 @@ fi #if [ "$1" -eq "7" ]; then # cd /workspace/llama.cpp # -# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +# LLAMA_CUDA=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 #fi # more benches -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 -#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUDA=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 diff --git a/scripts/run-with-preset.py b/scripts/run-with-preset.py index a18252730..e986a3604 100755 --- a/scripts/run-with-preset.py +++ b/scripts/run-with-preset.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import logging import argparse import os import subprocess @@ -7,6 +8,8 @@ import sys import yaml +logger = logging.getLogger("run-with-preset") + CLI_ARGS_MAIN_PERPLEXITY = [ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", @@ -56,6 +59,7 @@ parser.add_argument("-bin", "--binary", help="The binary to run.") parser.add_argument("yaml_files", nargs="*", help="Arbitrary number of YAML files from which to read preset values. " "If two files specify the same values the later one will be used.") +parser.add_argument("--verbose", action="store_true", help="increase output verbosity") known_args, unknown_args = parser.parse_known_args() @@ -63,6 +67,8 @@ if not known_args.yaml_files and not unknown_args: parser.print_help() sys.exit(0) +logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO) + props = dict() for yaml_file in known_args.yaml_files: @@ -85,7 +91,7 @@ elif binary.lower().endswith("llama-bench"): elif binary.lower().endswith("server"): cli_args = CLI_ARGS_SERVER else: - print(f"Unknown binary: {binary}") + logger.error(f"Unknown binary: {binary}") sys.exit(1) command_list = [binary] @@ -121,11 +127,11 @@ for cli_arg in cli_args: num_unused = len(props) if num_unused > 10: - print(f"The preset file contained a total of {num_unused} unused properties.") + logger.info(f"The preset file contained a total of {num_unused} unused properties.") elif num_unused > 0: - print("The preset file contained the following unused properties:") + logger.info("The preset file contained the following unused properties:") for prop, value in props.items(): - print(f" {prop}: {value}") + logger.info(f" {prop}: {value}") command_list += unknown_args diff --git a/scripts/server-llm.sh b/scripts/server-llm.sh index 30bbac321..eb6ce458e 100644 --- a/scripts/server-llm.sh +++ b/scripts/server-llm.sh @@ -380,7 +380,7 @@ fi if [[ "$backend" == "cuda" ]]; then printf "[+] Building with CUDA backend\n" - LLAMA_CUBLAS=1 make -j server $log + LLAMA_CUDA=1 make -j server $log elif [[ "$backend" == "cpu" ]]; then printf "[+] Building with CPU backend\n" make -j server $log diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 83e6359b1..7c157eb4c 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -60,11 +60,14 @@ while read c; do src/ggml*.m \ src/ggml*.metal \ src/ggml*.cu \ + src/ggml-cuda/* \ tests/test-opt.cpp \ tests/test-grad0.cpp \ tests/test-quantize-fns.cpp \ tests/test-quantize-perf.cpp \ tests/test-backend-ops.cpp \ + LICENSE \ + scripts/gen-authors.sh \ >> $SRC_LLAMA/ggml-src.patch done < $SRC_LLAMA/ggml-commits @@ -95,6 +98,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-backend-impl.h -> ggml-backend-impl.h # src/ggml-backend.c -> ggml-backend.c # src/ggml-common.h -> ggml-common.h + # src/ggml-cuda/* -> ggml-cuda/ # src/ggml-cuda.cu -> ggml-cuda.cu # src/ggml-cuda.h -> ggml-cuda.h # src/ggml-impl.h -> ggml-impl.h @@ -121,6 +125,9 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # tests/test-quantize-fns.cpp -> tests/test-quantize-fns.cpp # tests/test-quantize-perf.cpp -> tests/test-quantize-perf.cpp # tests/test-backend-ops.cpp -> tests/test-backend-ops.cpp + # + # LICENSE -> LICENSE + # scripts/gen-authors.sh -> scripts/gen-authors.sh cat ggml-src.patch | sed \ -e 's/src\/ggml\.c/ggml.c/g' \ @@ -128,6 +135,7 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \ -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \ -e 's/src\/ggml-common\.h/ggml-common.h/g' \ + -e 's/src\/ggml-cuda\//ggml-cuda\//g' \ -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \ -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \ -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \ @@ -153,6 +161,8 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/tests\/test-quantize-fns\.cpp/tests\/test-quantize-fns.cpp/g' \ -e 's/tests\/test-quantize-perf\.cpp/tests\/test-quantize-perf.cpp/g' \ -e 's/tests\/test-backend-ops\.cpp/tests\/test-backend-ops.cpp/g' \ + -e 's/LICENSE/LICENSE/g' \ + -e 's/scripts\/gen-authors\.sh/scripts\/gen-authors.sh/g' \ > ggml-src.patch.tmp mv ggml-src.patch.tmp ggml-src.patch diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index 7c30162e2..d4400695b 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -43a6d4af1971ee2912ff7bc2404011ff327b6a60 +98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index f1e6f0e57..80c43976f 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -5,6 +5,7 @@ cp -rpv ../ggml/src/ggml-alloc.c ./ggml-alloc.c cp -rpv ../ggml/src/ggml-backend-impl.h ./ggml-backend-impl.h cp -rpv ../ggml/src/ggml-backend.c ./ggml-backend.c cp -rpv ../ggml/src/ggml-common.h ./ggml-common.h +cp -rpv ../ggml/src/ggml-cuda/* ./ggml-cuda/ cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h cp -rpv ../ggml/src/ggml-impl.h ./ggml-impl.h @@ -30,3 +31,6 @@ cp -rpv ../ggml/include/ggml/ggml-backend.h ./ggml-backend.h cp -rpv ../ggml/tests/test-opt.cpp ./tests/test-opt.cpp cp -rpv ../ggml/tests/test-grad0.cpp ./tests/test-grad0.cpp cp -rpv ../ggml/tests/test-backend-ops.cpp ./tests/test-backend-ops.cpp + +cp -rpv ../LICENSE ./LICENSE +cp -rpv ../ggml/scripts/gen-authors.sh ./scripts/gen-authors.sh diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index dff4b4734..0b5b9aafa 100755 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,8 +1,11 @@ #!/usr/bin/env python3 +import logging import os import hashlib +logger = logging.getLogger("verify-checksum-models") + def sha256sum(file): block_size = 16 * 1024 * 1024 # 16 MB block size @@ -27,7 +30,7 @@ hash_list_file = os.path.join(llama_path, "SHA256SUMS") # Check if the hash list file exists if not os.path.exists(hash_list_file): - print(f"Hash list file not found: {hash_list_file}") + logger.error(f"Hash list file not found: {hash_list_file}") exit(1) # Read the hash file content and split it into an array of lines @@ -46,7 +49,7 @@ for line in hash_list: file_path = os.path.join(llama_path, filename) # Informing user of the progress of the integrity check - print(f"Verifying the checksum of {file_path}") + logger.info(f"Verifying the checksum of {file_path}") # Check if the file exists if os.path.exists(file_path): @@ -73,9 +76,9 @@ for line in hash_list: # Print column headers for results table -print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) -print("-" * 80) +print("filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) # noqa: NP100 +print("-" * 80) # noqa: NP100 # Output the results as a table for r in results: - print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") + print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") # noqa: NP100 diff --git a/scripts/xxd.cmake b/scripts/xxd.cmake new file mode 100644 index 000000000..f5ad6ab9b --- /dev/null +++ b/scripts/xxd.cmake @@ -0,0 +1,16 @@ +# CMake equivalent of `xxd -i ${INPUT} ${OUTPUT}` +# Usage: cmake -DINPUT=examples/server/public/index.html -DOUTPUT=examples/server/index.html.hpp -P scripts/xxd.cmake + +SET(INPUT "" CACHE STRING "Input File") +SET(OUTPUT "" CACHE STRING "Output File") + +get_filename_component(filename "${INPUT}" NAME) +string(REGEX REPLACE "\\.|-" "_" name "${filename}") + +file(READ "${INPUT}" hex_data HEX) +string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," hex_sequence "${hex_data}") + +string(LENGTH ${hex_data} hex_len) +math(EXPR len "${hex_len} / 2") + +file(WRITE "${OUTPUT}" "unsigned char ${name}[] = {${hex_sequence}};\nunsigned int ${name}_len = ${len};\n") diff --git a/sgemm.cpp b/sgemm.cpp new file mode 100644 index 000000000..40ba9d7e9 --- /dev/null +++ b/sgemm.cpp @@ -0,0 +1,1030 @@ +// Copyright 2024 Mozilla Foundation +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +// BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +// ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +// +// _ _ ___ _ _ ___ +// | |_(_)_ _ _ _| _ ) | /_\ / __| +// | _| | ' \ || | _ \ |__ / _ \\__ \. +// \__|_|_||_\_, |___/____/_/ \_\___/ +// |__/ +// +// BASIC LINEAR ALGEBRA SUBPROGRAMS +// +// +// This file implements multithreaded CPU matrix multiplication for the +// common contiguous use case C = Aᵀ * B. These kernels are designed to +// have excellent performance[1] for matrices that fit in the CPU cache +// without imposing any overhead such as cache filling or malloc calls. +// +// This implementation does not guarantee any upper bound with rounding +// errors, which grow along with k. Our goal's to maximally exploit the +// hardware for performance, and then use whatever resources remain for +// improving numerical accuracy. +// +// [1] J. Tunney, ‘LLaMA Now Goes Faster on CPUs’, Mar. 2024. [Online]. +// Available: https://justine.lol/matmul/. [Accessed: 29-Mar-2024]. + +#pragma GCC diagnostic ignored "-Wpedantic" +#pragma GCC diagnostic ignored "-Wignored-attributes" + +#include "sgemm.h" +#include "ggml-impl.h" +#include "ggml-quants.h" + +#ifdef _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE __attribute__((__noinline__)) +#endif + +#if defined(__ARM_NEON) || defined(__AVX512F__) +#define VECTOR_REGISTERS 32 +#else +#define VECTOR_REGISTERS 16 +#endif + +#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1) + +namespace { + +inline float unhalf(ggml_fp16_t d) { + return GGML_FP16_TO_FP32(d); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// VECTORIZED ARITHMETIC OPERATIONS + +#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +inline __m128 add(__m128 x, __m128 y) { return _mm_add_ps(x, y); } +inline __m128 sub(__m128 x, __m128 y) { return _mm_sub_ps(x, y); } +inline __m128 mul(__m128 x, __m128 y) { return _mm_mul_ps(x, y); } +#endif // __SSE__ + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +inline __m256 add(__m256 x, __m256 y) { return _mm256_add_ps(x, y); } +inline __m256 sub(__m256 x, __m256 y) { return _mm256_sub_ps(x, y); } +inline __m256 mul(__m256 x, __m256 y) { return _mm256_mul_ps(x, y); } +#endif // __AVX__ + +#if defined(__AVX512F__) +inline __m512 add(__m512 x, __m512 y) { return _mm512_add_ps(x, y); } +inline __m512 sub(__m512 x, __m512 y) { return _mm512_sub_ps(x, y); } +inline __m512 mul(__m512 x, __m512 y) { return _mm512_mul_ps(x, y); } +#endif // __AVX512F__ + +#if defined(__ARM_NEON) +inline float32x4_t add(float32x4_t x, float32x4_t y) { return vaddq_f32(x, y); } +inline float32x4_t sub(float32x4_t x, float32x4_t y) { return vsubq_f32(x, y); } +inline float32x4_t mul(float32x4_t x, float32x4_t y) { return vmulq_f32(x, y); } +#endif // __ARM_NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +inline float16x8_t add(float16x8_t x, float16x8_t y) { return vaddq_f16(x, y); } +inline float16x8_t sub(float16x8_t x, float16x8_t y) { return vsubq_f16(x, y); } +inline float16x8_t mul(float16x8_t x, float16x8_t y) { return vmulq_f16(x, y); } +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// VECTORIZED FUSED MULTIPLY ADD + +/** + * Computes a * b + c. + */ +template +inline U madd(T a, T b, U c) { + return add(mul(a, b), c); +} + +#if defined(__FMA__) +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +template <> +inline __m256 madd(__m256 a, __m256 b, __m256 c) { + return _mm256_fmadd_ps(a, b, c); +} +#endif +#if defined(__AVX512F__) +template <> +inline __m512 madd(__m512 a, __m512 b, __m512 c) { + return _mm512_fmadd_ps(a, b, c); +} +#endif +#endif + +#if defined(__ARM_FEATURE_FMA) +template <> +inline float32x4_t madd(float32x4_t a, float32x4_t b, float32x4_t c) { + return vfmaq_f32(c, b, a); +} +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) +template <> +inline float16x8_t madd(float16x8_t a, float16x8_t b, float16x8_t c) { + return vfmaq_f16(c, b, a); +} +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// VECTORIZED HORIZONTAL SUM + +#if defined(__ARM_NEON) +inline float hsum(float32x4_t x) { + return vaddvq_f32(x); +} +#endif // __ARM_NEON + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) +inline float hsum(float16x8_t x) { + return vaddvq_f32(vaddq_f32(vcvt_f32_f16(vget_low_f16(x)), + vcvt_f32_f16(vget_high_f16(x)))); +} +#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +inline float hsum(__m128 x) { +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) + x = _mm_add_ps(x, _mm_movehl_ps(x, x)); + x = _mm_add_ss(x, _mm_movehdup_ps(x)); +#else + __m128 t; + t = _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); + x = _mm_add_ps(x, t); + t = _mm_movehl_ps(t, x); + x = _mm_add_ss(x, t); +#endif + return _mm_cvtss_f32(x); +} +#endif + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +inline float hsum(__m256 x) { + return hsum(_mm_add_ps(_mm256_extractf128_ps(x, 1), + _mm256_castps256_ps128(x))); +} +#endif // __AVX__ + +#if defined(__AVX512F__) +inline float hsum(__m512 x) { + return _mm512_reduce_add_ps(x); +} +#endif // __AVX512F__ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// VECTORIZED MEMORY LOADING + +template T load(const U *); + +#if defined(__ARM_NEON) +template <> inline float32x4_t load(const float *p) { + return vld1q_f32(p); +} +#if !defined(_MSC_VER) +template <> inline float16x8_t load(const ggml_fp16_t *p) { + return vld1q_f16((const float16_t *)p); +} +template <> inline float32x4_t load(const ggml_fp16_t *p) { + return vcvt_f32_f16(vld1_f16((const float16_t *)p)); +} +#endif // _MSC_VER +#endif // __ARM_NEON + +#if defined(__SSE__) || defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m128 load(const float *p) { + return _mm_loadu_ps(p); +} +#endif // __SSE__ + +#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) +template <> inline __m256 load(const float *p) { + return _mm256_loadu_ps(p); +} +#endif // __AVX__ + +#if defined(__F16C__) +template <> inline __m256 load(const ggml_fp16_t *p) { + return _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)p)); +} +#endif // __F16C__ + +#if defined(__AVX512F__) +template <> inline __m512 load(const float *p) { + return _mm512_loadu_ps(p); +} +template <> inline __m512 load(const ggml_fp16_t *p) { + return _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)p)); +} +#endif // __AVX512F__ + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// FLOATING POINT MATRIX MULTIPLICATION + +template +class tinyBLAS { + public: + tinyBLAS(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); + } + + private: + NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 5) << 4) | MIN(n - n0, 5)) { +#if VECTOR_REGISTERS == 32 + case 0x55: + mc = 5; + nc = 5; + gemm<5, 5>(m0, m, n0, n); + break; + case 0x45: + mc = 4; + nc = 5; + gemm<4, 5>(m0, m, n0, n); + break; + case 0x54: + mc = 5; + nc = 4; + gemm<5, 4>(m0, m, n0, n); + break; + case 0x44: + mc = 4; + nc = 4; + gemm<4, 4>(m0, m, n0, n); + break; + case 0x53: + mc = 5; + nc = 3; + gemm<5, 3>(m0, m, n0, n); + break; + case 0x35: + mc = 3; + nc = 5; + gemm<3, 5>(m0, m, n0, n); + break; + case 0x43: + mc = 4; + nc = 3; + gemm<4, 3>(m0, m, n0, n); + break; +#else + case 0x55: + case 0x54: + case 0x53: + case 0x45: + case 0x44: + case 0x43: + mc = 4; + nc = 3; + gemm<4, 3>(m0, m, n0, n); + break; + case 0x35: +#endif + case 0x34: + mc = 3; + nc = 4; + gemm<3, 4>(m0, m, n0, n); + break; + case 0x52: + mc = 5; + nc = 2; + gemm<5, 2>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm<3, 3>(m0, m, n0, n); + break; + case 0x25: + mc = 2; + nc = 5; + gemm<2, 5>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm<2, 4>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x51: + mc = 5; + nc = 1; + gemm<5, 1>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm<4, 1>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x15: + mc = 1; + nc = 5; + gemm<1, 5>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm<1, 4>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + D Cv[RN][RM] = {}; + for (int64_t l = 0; l < k; l += KN) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + Cv[j][i] = madd(load(A + lda * (ii + i) + l), + load(B + ldb * (jj + j) + l), + Cv[j][i]); + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); + } + } + + const TA *const A; + const TB *const B; + TC *const C; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; + +////////////////////////////////////////////////////////////////////////////////////////// +// QUANT ZERO MATRIX MULTIPLICATION + +#if defined(__ARM_FEATURE_DOTPROD) +template +class tinyBLAS_Q0_ARM { + public: + tinyBLAS_Q0_ARM(int64_t k, + const TA *A, int64_t lda, + const block_q8_0 *B, int64_t ldb, + float *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); + } + + private: + NOINLINE void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 3) << 4) | MIN(n - n0, 3ll)) { + case 0x33: + mc = 3; + nc = 3; + gemm<3, 3>(m0, m, n0, n); + break; + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + float32x4_t Cv[RN][RM] = {}; + for (int64_t l = 0; l < k; ++l) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + Cv[j][i] = vmlaq_n_f32(Cv[j][i], + vcvtq_f32_s32(vdotq_s32( + vdotq_s32(vdupq_n_s32(0), + load_lo(A + lda * (ii + i) + l), + load_lo(B + ldb * (jj + j) + l)), + load_hi(A + lda * (ii + i) + l), + load_hi(B + ldb * (jj + j) + l))), + unhalf(A[lda * (ii + i) + l].d) * + unhalf(B[ldb * (jj + j) + l].d)); + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); + } + } + + inline int8x16_t load_lo(const block_q8_0 *b) { + return vld1q_s8(b->qs); + } + + inline int8x16_t load_hi(const block_q8_0 *b) { + return vld1q_s8(b->qs + 16); + } + + inline int8x16_t load_lo(const block_q4_0 *b) { + return vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vld1q_u8(b->qs), + vdupq_n_u8(0x0f))), + vdupq_n_s8(0x8)); + } + + inline int8x16_t load_hi(const block_q4_0 *b) { + return vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(vld1q_u8(b->qs), 4)), + vdupq_n_s8(0x8)); + } + + const TA *const A; + const block_q8_0 *const B; + float *const C; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; +#endif // __ARM_FEATURE_DOTPROD + +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) +template +class tinyBLAS_Q0_AVX { + public: + tinyBLAS_Q0_AVX(int64_t k, + const TA *A, int64_t lda, + const TB *B, int64_t ldb, + TC *C, int64_t ldc, + int ith, int nth) + : A(A), B(B), C(C), k(k), lda(lda), ldb(ldb), ldc(ldc), ith(ith), nth(nth) { + } + + void matmul(int64_t m, int64_t n, int task) { + if (task == GGML_TASK_TYPE_COMPUTE) + mnpack(0, m, 0, n); + } + + private: + void mnpack(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t mc, nc, mp, np; + switch ((MIN(m - m0, 4) << 4) | MIN(n - n0, 4)) { +#if VECTOR_REGISTERS == 32 + case 0x44: + mc = 4; + nc = 4; + gemm<4, 4>(m0, m, n0, n); + break; + case 0x43: + mc = 4; + nc = 3; + gemm<4, 3>(m0, m, n0, n); + break; + case 0x34: + mc = 3; + nc = 4; + gemm<3, 4>(m0, m, n0, n); + break; + case 0x33: + mc = 3; + nc = 3; + gemm<3, 3>(m0, m, n0, n); + break; + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x24: + mc = 2; + nc = 4; + gemm<2, 4>(m0, m, n0, n); + break; +#else + case 0x44: + case 0x43: + case 0x42: + mc = 4; + nc = 2; + gemm<4, 2>(m0, m, n0, n); + break; + case 0x34: + case 0x24: + mc = 2; + nc = 4; + gemm<2, 4>(m0, m, n0, n); + break; + case 0x33: +#endif + case 0x32: + mc = 3; + nc = 2; + gemm<3, 2>(m0, m, n0, n); + break; + case 0x23: + mc = 2; + nc = 3; + gemm<2, 3>(m0, m, n0, n); + break; + case 0x41: + mc = 4; + nc = 1; + gemm<4, 1>(m0, m, n0, n); + break; + case 0x22: + mc = 2; + nc = 2; + gemm<2, 2>(m0, m, n0, n); + break; + case 0x14: + mc = 1; + nc = 4; + gemm<1, 4>(m0, m, n0, n); + break; + case 0x31: + mc = 3; + nc = 1; + gemm<3, 1>(m0, m, n0, n); + break; + case 0x13: + mc = 1; + nc = 3; + gemm<1, 3>(m0, m, n0, n); + break; + case 0x21: + mc = 2; + nc = 1; + gemm<2, 1>(m0, m, n0, n); + break; + case 0x12: + mc = 1; + nc = 2; + gemm<1, 2>(m0, m, n0, n); + break; + case 0x11: + mc = 1; + nc = 1; + gemm<1, 1>(m0, m, n0, n); + break; + default: + return; + } + mp = m0 + (m - m0) / mc * mc; + np = n0 + (n - n0) / nc * nc; + mnpack(mp, m, n0, np); + mnpack(m0, m, np, n); + } + + template + NOINLINE void gemm(int64_t m0, int64_t m, int64_t n0, int64_t n) { + int64_t ytiles = (m - m0) / RM; + int64_t xtiles = (n - n0) / RN; + int64_t tiles = xtiles * ytiles; + int64_t duty = (tiles + nth - 1) / nth; + int64_t start = duty * ith; + int64_t end = start + duty; + if (end > tiles) + end = tiles; + for (int64_t job = start; job < end; ++job) { + int64_t ii = m0 + job / xtiles * RM; + int64_t jj = n0 + job % xtiles * RN; + __m256 Cv[RN][RM] = {}; + for (int64_t l = 0; l < k; ++l) + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) { +#if defined(__AVX2__) + __m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l), + load(A + lda * (ii + i) + l)), + _mm256_sign_epi8(load(B + ldb * (jj + j) + l), + load(A + lda * (ii + i) + l))); +#else + __m128i ali0 = load0(A + lda * (ii + i) + l); + __m128i ali1 = load1(A + lda * (ii + i) + l); + __m128i blj0 = load0(B + ldb * (jj + j) + l); + __m128i blj1 = load1(B + ldb * (jj + j) + l); + + __m128i sepAA0 = _mm_sign_epi8(ali0, ali0); + __m128i sepAA1 = _mm_sign_epi8(ali1, ali1); + __m128i sepBA0 = _mm_sign_epi8(blj0, ali0); + __m128i sepBA1 = _mm_sign_epi8(blj1, ali1); + + // updot + const __m128i oneFill = _mm_set1_epi16(1); + __m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0); + __m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1); + __m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0))); +#endif + Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) * + unhalf(B[ldb * (jj + j) + l].d)), + udTmp, + Cv[j][i]); + } + for (int64_t j = 0; j < RN; ++j) + for (int64_t i = 0; i < RM; ++i) + C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]); + } + } + + inline __m256i load(const block_q8_0 *b) { + return _mm256_loadu_si256((const __m256i *)b->qs); + } + + inline __m128i load0(const block_q8_0 *b) { + return _mm_loadu_si128((const __m128i *)b->qs); + } + + inline __m128i load1(const block_q8_0 *b) { + return _mm_loadu_si128(((const __m128i *)b->qs) + 1); + } + + inline __m256i load(const block_q4_0 *b) { + return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8)); + } + + inline __m128i load0(const block_q4_0 *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8)); + } + + inline __m128i load1(const block_q4_0 *b) { + const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs)); + return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8)); + } + + inline __m256 updot(__m256i u, __m256i s) { + __m256i res; +#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__)) + res = _mm256_dpbusd_epi32(_mm256_setzero_si256(), u, s); +#else + res = _mm256_madd_epi16(_mm256_set1_epi16(1), _mm256_maddubs_epi16(u, s)); +#endif + return _mm256_cvtepi32_ps(res); + } + + static inline __m256i denibble(const uint8_t *p) { + __m128i x = _mm_loadu_si128((const __m128i *)p); + return _mm256_and_si256(_mm256_set1_epi8(15), + _mm256_insertf128_si256(_mm256_castsi128_si256(x), + _mm_srli_epi16(x, 4), 1)); + } + + const TA *const A; + const TB *const B; + TC *const C; + const int64_t k; + const int64_t lda; + const int64_t ldb; + const int64_t ldc; + const int ith; + const int nth; +}; +#endif // __AVX__ + +} // namespace + +/** + * Performs optimized matrix multiplication on CPU. + * + * This subroutine may compute C = Aᵀ * B with column major ordering. + * Despite its name, this isn't a generalized implementation. Work is + * only performed when a handwritten kernel is written and available. + * Otherwise the caller should fall back to a general matmul routine. + * + * For example, for single-threaded single-precision GEMM you can say + * + * llamafile_sgemm(m, n, k, A, lda, B, ldb, C, ldc, + * 0, 1, GGML_TASK_TYPE_COMPUTE, + * GGML_TYPE_F32, GGML_TYPE_F32, GGML_TYPE_F32); + * + * @param m is rows in `A` and `C` + * @param n is cols in `B` and `C` + * @param k is cols in `A` and rows in `B` + * @param A is first input matrix (always transposed) + * @param lda is row stride of `A` + * @param B is second input matrix (never transposed) + * @param ldb is row stride of `B` + * @param C is input/output array of output matrices + * @param ldc is row stride of `C` + * @param ith is thread id (must be less than `nth`) + * @param nth is number of threads (must be greater than zero) + * @param task is GGML task type + * @param Atype is GGML data type of `A` + * @param Btype is GGML data type of `B` + * @param Ctype is GGML data type of `C` + * @return true if this function was able to service the matmul request + */ +bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda, const void *B, int64_t ldb, void *C, + int64_t ldc, int ith, int nth, int task, int Atype, int Btype, int Ctype) { + + assert(m >= 0); + assert(n >= 0); + assert(k >= 0); + assert(lda >= k); + assert(ldb >= k); + assert(ldc >= m); + assert(nth > 0); + assert(ith < nth); + + if (Ctype != GGML_TYPE_F32) + return false; + + switch (Atype) { + + case GGML_TYPE_F32: { + if (Btype != GGML_TYPE_F32) + return false; +#if defined(__AVX512F__) + if (k % 16) + return false; + tinyBLAS<16, __m512, __m512, float, float, float> tb{ + k, (const float *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__AVX__) || defined(__AVX2__) + if (k % 8) + return false; + tinyBLAS<8, __m256, __m256, float, float, float> tb{ + k, (const float *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) + if (n < 4) + return false; + if (k % 4) + return false; + tinyBLAS<4, float32x4_t, float32x4_t, float, float, float> tb{ + k, (const float *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return false; +#endif + } + + case GGML_TYPE_F16: { +#if defined(__AVX512F__) + if (k % 16) + return false; + if (Btype != GGML_TYPE_F32) + return false; + tinyBLAS<16, __m512, __m512, ggml_fp16_t, float, float> tb{ + k, (const ggml_fp16_t *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif (defined(__AVX__) || defined(__AVX2__)) && defined(__F16C__) + if (k % 8) + return false; + if (Btype != GGML_TYPE_F32) + return false; + tinyBLAS<8, __m256, __m256, ggml_fp16_t, float, float> tb{ + k, (const ggml_fp16_t *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(_MSC_VER) + if (n < 8) + return false; + if (k % 8) + return false; + if (Btype != GGML_TYPE_F16) + return false; + tinyBLAS<8, float16x8_t, float16x8_t, ggml_fp16_t, ggml_fp16_t, float> tb{ + k, (const ggml_fp16_t *)A, lda, + (const ggml_fp16_t *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_NEON) && !defined(_MSC_VER) + if (k % 4) + return false; + if (Btype != GGML_TYPE_F32) + return false; + tinyBLAS<4, float32x4_t, float32x4_t, ggml_fp16_t, float, float> tb{ + k, (const ggml_fp16_t *)A, lda, + (const float *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return false; +#endif + } + + case GGML_TYPE_Q8_0: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_q8_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM tb{ + k, (const block_q8_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return false; +#endif + } + + case GGML_TYPE_Q4_0: { + if (Btype != GGML_TYPE_Q8_0) + return false; +#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__) + tinyBLAS_Q0_AVX tb{ + k, (const block_q4_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#elif defined(__ARM_FEATURE_DOTPROD) + tinyBLAS_Q0_ARM tb{ + k, (const block_q4_0 *)A, lda, + (const block_q8_0 *)B, ldb, + (float *)C, ldc, + ith, nth}; + tb.matmul(m, n, task); + return true; +#else + return false; +#endif + } + + default: + return false; + } + + (void)m; + (void)n; + (void)k; + (void)A; + (void)lda; + (void)B; + (void)ldb; + (void)C; + (void)ldc; + (void)ith; + (void)nth; + (void)task; + (void)Atype; + (void)Btype; + (void)Ctype; +} diff --git a/sgemm.h b/sgemm.h new file mode 100644 index 000000000..f29747d0a --- /dev/null +++ b/sgemm.h @@ -0,0 +1,14 @@ +#pragma once +#include +#include +#ifdef __cplusplus +extern "C" { +#endif + +bool llamafile_sgemm(int64_t, int64_t, int64_t, const void *, int64_t, + const void *, int64_t, void *, int64_t, int, int, + int, int, int, int); + +#ifdef __cplusplus +} +#endif diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a43439aed..d409a1d6b 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,10 +1,40 @@ +function(llama_test target) + include(CMakeParseArguments) + set(options) + set(oneValueArgs NAME LABEL WORKING_DIRECTORY) + set(multiValueArgs ARGS) + cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT DEFINED LLAMA_TEST_LABEL) + set(LLAMA_TEST_LABEL "main") + endif() + if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY) + set(LLAMA_TEST_WORKING_DIRECTORY .) + endif() + if (DEFINED LLAMA_TEST_NAME) + set(TEST_NAME ${LLAMA_TEST_NAME}) + else() + set(TEST_NAME ${target}) + endif() + + set(TEST_TARGET ${target}) + + add_test( + NAME ${TEST_NAME} + WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} + COMMAND $ + ${LLAMA_TEST_ARGS}) + + set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL}) +endfunction() + # Builds and runs a test source file. # Optional args: # - NAME: name of the executable & test target (defaults to the source file name without extension) # - LABEL: label for the test (defaults to main) # - ARGS: arguments to pass to the test executable # - WORKING_DIRECTORY -function(llama_test source) +function(llama_target_and_test source) include(CMakeParseArguments) set(options) set(oneValueArgs NAME LABEL WORKING_DIRECTORY) @@ -25,7 +55,7 @@ function(llama_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) - target_link_libraries(${TEST_TARGET} PRIVATE common json-schema-to-grammar) + target_link_libraries(${TEST_TARGET} PRIVATE common) add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} @@ -35,40 +65,71 @@ function(llama_test source) set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL}) endfunction() -# llama_test(test-double-float.cpp) # SLOW -llama_test(test-quantize-fns.cpp) -llama_test(test-quantize-perf.cpp) -llama_test(test-sampling.cpp) -llama_test(test-chat-template.cpp) +# build test-tokenizer-0 target once and add many tests +add_executable(test-tokenizer-0 test-tokenizer-0.cpp) +target_link_libraries(test-tokenizer-0 PRIVATE common) +install(TARGETS test-tokenizer-0 RUNTIME) -llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) -llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-phi-3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-phi-3.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) +# TODO: enable when fixed +# https://github.com/ggerganov/llama.cpp/pull/7036 +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) +#llama_test(test-tokenizer-0 NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) +llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf) -llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) -llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) +# build test-tokenizer-1-bpe target once and add many tests +add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) +target_link_libraries(test-tokenizer-1-bpe PRIVATE common) +install(TARGETS test-tokenizer-1-bpe RUNTIME) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) -llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf) -#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG +# TODO: disabled due to slowness +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf) +#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) -llama_test(test-grammar-parser.cpp) -llama_test(test-llama-grammar.cpp) -llama_test(test-grad0.cpp) -# llama_test(test-opt.cpp) # SLOW -llama_test(test-backend-ops.cpp) +# build test-tokenizer-1-spm target once and add many tests +add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp) +target_link_libraries(test-tokenizer-1-spm PRIVATE common) +install(TARGETS test-tokenizer-1-spm RUNTIME) -llama_test(test-rope.cpp) +llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) +#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) -llama_test(test-model-load-cancel.cpp LABEL "model") -llama_test(test-autorelease.cpp LABEL "model") +# llama_target_and_test(test-double-float.cpp) # SLOW +llama_target_and_test(test-quantize-fns.cpp) +llama_target_and_test(test-quantize-perf.cpp) +llama_target_and_test(test-sampling.cpp) +llama_target_and_test(test-chat-template.cpp) -llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) +llama_target_and_test(test-grammar-parser.cpp) +llama_target_and_test(test-llama-grammar.cpp) +llama_target_and_test(test-grammar-integration.cpp) +llama_target_and_test(test-grad0.cpp) +# llama_target_and_test(test-opt.cpp) # SLOW +llama_target_and_test(test-backend-ops.cpp) + +llama_target_and_test(test-rope.cpp) + +llama_target_and_test(test-model-load-cancel.cpp LABEL "model") +llama_target_and_test(test-autorelease.cpp LABEL "model") + +llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) # dummy executable - not installed diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index c2916c3e4..0d66de5d9 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); - } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) { + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); std::vector dataq(ggml_row_size(tensor->type, size)); std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix @@ -92,6 +92,8 @@ static std::vector tensor_to_float(const ggml_tensor * t) { size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0]; if (t->type == GGML_TYPE_F16) { tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i])); + } else if (t->type == GGML_TYPE_BF16) { + tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i])); } else if (t->type == GGML_TYPE_F32) { tv.push_back(*(float *) &buf[i]); } else if (t->type == GGML_TYPE_I32) { @@ -101,7 +103,7 @@ static std::vector tensor_to_float(const ggml_tensor * t) { } else if (t->type == GGML_TYPE_I8) { tv.push_back((float)*(int8_t *) &buf[i]); } else if (quantized) { - tt.to_float(&buf[i], vq.data(), ggml_blck_size(t->type)); + tt.to_float(&buf[i], vq.data(), bs); tv.insert(tv.end(), vq.begin(), vq.end()); } else { GGML_ASSERT(false); @@ -948,14 +950,14 @@ struct test_mul_mat_id : public test_case { const ggml_type type_a; const ggml_type type_b; const int n_mats; - const int id; + const int n_used; + const bool b; // brodcast b matrix const int64_t m; const int64_t n; const int64_t k; - const bool v; // view (non-contiguous ids) std::string vars() override { - return VARS_TO_STR8(type_a, type_b, n_mats, id, m, n, k, v); + return VARS_TO_STR8(type_a, type_b, n_mats, n_used, b, m, n, k); } double max_nmse_err() override { @@ -972,24 +974,22 @@ struct test_mul_mat_id : public test_case { } test_mul_mat_id(ggml_type type_a = GGML_TYPE_F32, ggml_type type_b = GGML_TYPE_F32, - int n_mats = 2, int id = 0, - int64_t m = 32, int64_t n = 32, int64_t k = 32, bool v = false) - : type_a(type_a), type_b(type_b), n_mats(n_mats), id(id), - m(m), n(n), k(k), v(v) {} + int n_mats = 8, int n_used = 2, bool b = false, + int64_t m = 32, int64_t n = 32, int64_t k = 32) + : type_a(type_a), type_b(type_b), n_mats(n_mats), n_used(n_used), b(b), + m(m), n(n), k(k) { + GGML_ASSERT(n_used <= n_mats); + } ggml_tensor * build_graph(ggml_context * ctx) override { // C^T = A * B^T: (k, m) * (k, n) => (m, n) - std::vector mats; - for (int i = 0; i < n_mats; i++) { - ggml_tensor * a = ggml_new_tensor_2d(ctx, type_a, k, m); - mats.push_back(a); - } + ggml_tensor * as = ggml_new_tensor_3d(ctx, type_a, k, m, n_mats); ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, n); - if (v) { - ids = ggml_view_2d(ctx, ids, n_mats/2, ids->ne[1], ids->nb[1], 0); + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, n, ids->nb[1], 0); } - ggml_tensor * b = ggml_new_tensor_2d(ctx, type_b, k, n); - ggml_tensor * out = ggml_mul_mat_id(ctx, mats.data(), n_mats, ids, v ? id/2 : id, b); + ggml_tensor * b = ggml_new_tensor_3d(ctx, type_b, k, this->b ? 1 : n_used, n); + ggml_tensor * out = ggml_mul_mat_id(ctx, as, b, ids); return out; } @@ -1092,6 +1092,12 @@ struct test_soft_max : public test_case { return VARS_TO_STR5(type, ne, mask, scale, max_bias); } + // the 1024 test with bias occasionally fails: + // SOFT_MAX(type=f32,ne=[1024,16,1,1],mask=1,scale=1.000000,max_bias=8.000000): [SOFT_MAX] NMSE = 0.000000103 > 0.000000100 FAIL + virtual double max_nmse_err() override { + return 1e-6; + } + test_soft_max(ggml_type type = GGML_TYPE_F32, std::array ne = {10, 10, 10, 10}, bool mask = false, @@ -1103,7 +1109,7 @@ struct test_soft_max : public test_case { ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data()); ggml_tensor * mask = nullptr; if (this->mask) { - mask = ggml_new_tensor_2d(ctx, type, ne[0], ne[1]); + mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]); } ggml_tensor * pos = nullptr; if (max_bias > 0.0f) { @@ -1477,91 +1483,34 @@ struct test_leaky_relu : public test_case { } }; -// Mixtral MOE -struct test_moe : public test_case { - const int n_experts; - const int n_experts_per_tok; - const int n_tokens; - const int n_embd; - const int n_ff; - - std::string op_desc(ggml_tensor * t) override { - return "MOE"; - - GGML_UNUSED(t); - } +// GGML_OP_FLASH_ATTN_EXT +struct test_flash_attn_ext : public test_case { + const int64_t hs; // head size + const int64_t nh; // num heads + const int64_t kv; // kv size + const int64_t nb; // batch size std::string vars() override { - return VARS_TO_STR5(n_experts, n_experts_per_tok, n_tokens, n_embd, n_ff); + return VARS_TO_STR4(hs, nh, kv, nb); } - test_moe(int n_experts = 8, int n_experts_per_tok = 2, int n_tokens = 1, int n_embd = 4096, int n_ff = 14336) - : n_experts(n_experts), n_experts_per_tok(n_experts_per_tok), n_tokens(n_tokens), n_embd(n_embd), n_ff(n_ff) { + double max_nmse_err() override { + return 5e-4; } + test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8) + : hs(hs), nh(nh), kv(kv), nb(nb) {} + ggml_tensor * build_graph(ggml_context * ctx) override { - ggml_tensor * ffn_gate_inp = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_experts); - - std::vector ffn_up_exp(n_experts); - std::vector ffn_gate_exp(n_experts); - std::vector ffn_down_exp(n_experts); - - for (int i = 0; i < n_experts; ++i) { - ffn_up_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - ffn_gate_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); - ffn_down_exp[i] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); - } - - ggml_tensor * cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_tensor * logits = ggml_mul_mat(ctx, ffn_gate_inp, cur); - ggml_tensor * probs = ggml_soft_max_ext(ctx, logits, nullptr, nullptr, 1.0f/sqrtf(n_embd), 0.0f); - - // select experts - ggml_tensor * selected_experts = ggml_top_k(ctx, probs, n_experts_per_tok); - - ggml_tensor * weights = ggml_get_rows(ctx, - ggml_reshape_3d(ctx, probs, 1, n_experts, n_tokens), selected_experts); - - weights = ggml_reshape_2d(ctx, weights, n_experts_per_tok, n_tokens); - - ggml_tensor * weights_sum = ggml_sum_rows(ctx, weights); - - weights = ggml_div(ctx, weights, weights_sum); - - // compute expert outputs - ggml_tensor * moe_out = nullptr; - - for (int i = 0; i < n_experts_per_tok; ++i) { - ggml_tensor * cur_expert; - - ggml_tensor * cur_up = ggml_mul_mat_id(ctx, ffn_up_exp.data(), n_experts, selected_experts, i, cur); - - ggml_tensor * cur_gate = ggml_mul_mat_id(ctx, ffn_gate_exp.data(), n_experts, selected_experts, i, cur); - - cur_gate = ggml_silu(ctx, cur_gate); - - cur_expert = ggml_mul(ctx, cur_up, cur_gate); - - cur_expert = ggml_mul_mat_id(ctx, ffn_down_exp.data(), n_experts, selected_experts, i, cur_expert); - - cur_expert = ggml_mul(ctx, cur_expert, - ggml_view_2d(ctx, weights, 1, n_tokens, weights->nb[1], i*weights->nb[0])); - - if (i == 0) { - moe_out = cur_expert; - } else { - moe_out = ggml_add(ctx, moe_out, cur_expert); - } - } - - cur = moe_out; - - return cur; + ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1); + ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); + ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); + ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1); + ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs)); + return out; } }; - enum llm_norm_type { LLM_NORM, LLM_NORM_RMS, @@ -1700,7 +1649,6 @@ public: } }; - // Llama struct test_llama : public test_llm { static constexpr float freq_base = 10000.0f; @@ -1749,7 +1697,7 @@ struct test_llama : public test_llm { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); @@ -1871,7 +1819,7 @@ struct test_falcon : public test_llm { struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, hp.n_tokens); // KQ_mask (mask for 1 head, it will be broadcasted to all heads) - struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, hp.n_kv, hp.n_tokens, 1); + struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, hp.n_kv, hp.n_tokens, 1); ggml_tensor * k_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); ggml_tensor * v_l = ggml_new_tensor_1d(ctx, GGML_TYPE_F16, 1638400); @@ -1952,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op std::default_random_engine rng(0); const ggml_type all_types[] = { - GGML_TYPE_F32, GGML_TYPE_F16, + GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0, @@ -1960,13 +1908,33 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op GGML_TYPE_Q4_K, GGML_TYPE_Q5_K, GGML_TYPE_Q6_K, GGML_TYPE_IQ2_XXS, GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, - GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, + GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, + GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, + }; + + const ggml_type base_types[] = { + GGML_TYPE_F32, GGML_TYPE_F16, + GGML_TYPE_Q4_0, + GGML_TYPE_Q4_K, + GGML_TYPE_IQ2_XXS + }; + + const ggml_type other_types[] = { + GGML_TYPE_Q4_1, + GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, + GGML_TYPE_Q8_0, + GGML_TYPE_Q2_K, GGML_TYPE_Q3_K, + GGML_TYPE_Q5_K, + GGML_TYPE_Q6_K, + GGML_TYPE_IQ2_XS, GGML_TYPE_IQ2_S, + GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ1_S, GGML_TYPE_IQ1_M, GGML_TYPE_IQ4_NL, GGML_TYPE_IQ3_S, GGML_TYPE_IQ4_XS, }; // unary ops for (int op = 0; op < GGML_UNARY_OP_COUNT; op++) { test_cases.emplace_back(new test_unary((ggml_unary_op) op)); + test_cases.emplace_back(new test_unary((ggml_unary_op) op, GGML_TYPE_F32, { 7, 13, 19, 23 })); } test_cases.emplace_back(new test_get_rows(GGML_TYPE_F32, 1, 8, 2, 1, false)); @@ -2071,7 +2039,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps)); } - for (ggml_type type_a : all_types) { + for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) { test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1})); @@ -2091,12 +2059,45 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op } } - for (ggml_type type_a : all_types) { + for (ggml_type type_a : other_types) { + for (ggml_type type_b : {GGML_TYPE_F32}) { + test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1})); + } + } + + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 128, { 8, 1}, {1, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 128, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 2, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1})); + test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1})); + + for (ggml_type type_a : base_types) { for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { - for (int n_mats : {2, 4, 8}) { - for (int id = 0; id < n_mats; id++) { - for (bool v : {false, true}) { - test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, id, 16, 16, 256, v)); + for (int n_mats : {4, 8}) { + for (int n_used : {1, 2, 4}) { + for (bool b : {false, true}) { + for (int n : {1, 32}) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); + } + } + } + } + } + } + + for (ggml_type type_a : other_types) { + for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) { + for (int n_mats : {4}) { + for (int n_used : {2}) { + for (bool b : {false}) { + for (int n : {1}) { + int m = 512; + int k = 256; + test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k)); + } } } } @@ -2130,7 +2131,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (float scale : {1.0f, 0.1f}) { for (int64_t ne0 : {16, 1024}) { for (int64_t ne1 : {16, 1024}) { - test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias)); + test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0, ne1, 1, 1}, mask, scale, max_bias)); test_cases.emplace_back(new test_soft_max(GGML_TYPE_F32, {ne0-1, ne1-1, 1, 1}, mask, scale, max_bias)); } } @@ -2162,6 +2163,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op for (ggml_sort_order order : {GGML_SORT_ORDER_ASC, GGML_SORT_ORDER_DESC}) { test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {8, 1, 1, 1}, order)); test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {16, 10, 10, 10}, order)); + test_cases.emplace_back(new test_argsort(GGML_TYPE_F32, {60, 10, 10, 10}, order)); // qwen } test_cases.emplace_back(new test_sum_rows()); @@ -2173,13 +2175,22 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + for (int hs : { 64, 128, }) { // other head sizes not implemented +#else + for (int hs : { 64, 80, 128, 256, }) { +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + for (int nh : { 32, }) { + for (int kv : { 512, 1024, }) { + for (int nb : { 1, 2, 4, 8, }) { + test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb)); + } + } + } + } + // these tests are disabled to save execution time, but they can be handy for debugging #if 0 -#if !defined(__SANITIZE_THREAD__) - // FIXME: these tests use too much memory with thread sanitizer - test_cases.emplace_back(new test_moe(8, 2, 1, 4096, 8*1024)); - //test_cases.emplace_back(new test_moe(8, 2, 8, 4096, 14336)); -#endif test_cases.emplace_back(new test_llama(1)); test_cases.emplace_back(new test_llama(2)); test_cases.emplace_back(new test_falcon(1)); diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index 2ac32f713..f056ae6fa 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -35,6 +35,24 @@ int main(void) { "{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}", // openbmb/MiniCPM-2B-dpo-fp32 "{% for message in messages %}{% if message['role'] == 'user' %}{{'<\xe7\x94\xa8\xe6\x88\xb7>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}", + // openchat/openchat-3.5-0106 + // The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d + // So we match against the included template but implement the suggested version. + "{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}", + // deepseek-ai/deepseek-coder-33b-instruct + "{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}", + // eachadea/vicuna-13b-1.1 + // No template included in tokenizer_config.json, so this template likely needs to be manually set. + "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", + // Orca-Vicuna + // No template included in tokenizer_config.json, so this template likely needs to be manually set. + "{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}", + // CohereForAI/c4ai-command-r-plus + "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}", + // Llama-3 + "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", + // Phi-3 + "{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + ' ' + message['content'] + '<|end|> ' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|> ' }}{% else %}{{ eos_token }}{% endif %}" }; std::vector expected_output = { // teknium/OpenHermes-2.5-Mistral-7B @@ -53,6 +71,20 @@ int main(void) { "Human: You are a helpful assistant\n\nHello\n\nAssistant: Hi thereHuman: Who are you\n\nAssistant: I am an assistant Human: Another question\n\nAssistant: ", // openbmb/MiniCPM-2B-dpo-fp32 "You are a helpful assistant<\xe7\x94\xa8\xe6\x88\xb7>HelloHi there<\xe7\x94\xa8\xe6\x88\xb7>Who are youI am an assistant<\xe7\x94\xa8\xe6\x88\xb7>Another question", + // openchat/openchat-3.5-0106 + "You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:", + // deepseek-ai/deepseek-coder-33b-instruct + "You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n I am an assistant \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n", + // eachadea/vicuna-13b-1.1 + "You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", + // Orca-Vicuna + "SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there\nUSER: Who are you\nASSISTANT: I am an assistant \nUSER: Another question\nASSISTANT:", + // CohereForAI/c4ai-command-r-plus + "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", + // Llama 3 + "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", + // Phi 3 + "<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\nI am an assistant<|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n", }; std::vector formatted_chat(1024); int32_t res; diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp new file mode 100644 index 000000000..1a4004e2a --- /dev/null +++ b/tests/test-grammar-integration.cpp @@ -0,0 +1,332 @@ +#ifdef NDEBUG +#undef NDEBUG +#endif + +#define LLAMA_API_INTERNAL + +#include "ggml.h" +#include "llama.h" +#include "grammar-parser.h" +#include "unicode.h" +#include +#include +#include + +static llama_grammar* build_grammar(const std::string & grammar_str) { + auto parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we have a root node + assert(!(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end())); + + std::vector grammar_rules(parsed_grammar.c_rules()); + llama_grammar* grammar = llama_grammar_init( + grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root")); + + return grammar; +} + +static bool match_string(const std::string & input, llama_grammar* grammar) { + auto decoded = decode_utf8(input, {}); + + const auto & code_points = decoded.first; + + for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) { + auto prev_stacks = grammar->stacks; + llama_grammar_accept(grammar->rules, prev_stacks, *it, grammar->stacks); + if (grammar->stacks.empty()) { + // no stacks means that the grammar failed to match at this point + return false; + } + } + + for (const auto & stack : grammar->stacks) { + if (stack.empty()) { + // An empty stack means that the grammar has been completed + return true; + } + } + + return false; +} + +static void test_grammar(const std::string & test_desc, const std::string & grammar_str, const std::vector & passing_strings, const std::vector & failing_strings) { + fprintf(stderr, "⚫ Testing %s. Grammar: %s\n", test_desc.c_str(), grammar_str.c_str()); + fflush(stderr); + + auto grammar = build_grammar(grammar_str); + + // Save the original grammar stacks so that we can reset after every new string we want to test + auto original_stacks = grammar->stacks; + + fprintf(stderr, " 🔵 Valid strings:\n"); + + // Passing strings + for (const auto & test_string : passing_strings) { + fprintf(stderr, " \"%s\" ", test_string.c_str()); + fflush(stderr); + + bool matched = match_string(test_string, grammar); + + if (!matched) { + fprintf(stderr, "❌ (failed to match)\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + + assert(matched); + + // Reset the grammar stacks + grammar->stacks = original_stacks; + } + + fprintf(stderr, " 🟠 Invalid strings:\n"); + + // Failing strings + for (const auto & test_string : failing_strings) { + fprintf(stderr, " \"%s\" ", test_string.c_str()); + fflush(stderr); + + bool matched = match_string(test_string, grammar); + + if (matched) { + fprintf(stderr, "❌ (incorrectly matched)\n"); + } else { + fprintf(stdout, "✅︎\n"); + } + assert(!matched); + + // Reset the grammar stacks + grammar->stacks = original_stacks; + } + + // Clean up allocated memory + llama_grammar_free(grammar); +} + +static void test_simple_grammar() { + // Test case for a simple grammar + test_grammar( + "simple grammar", + R"""( + root ::= expr + expr ::= term ("+" term)* + term ::= number + number ::= [0-9]+)""", + // Passing strings + { + "42", + "1+2+3+4+5", + "123+456", + }, + // Failing strings + { + "+", + "/ 3", + "1+2+3+4+5+", + "12a45", + } + ); +} + +static void test_complex_grammar() { + // Test case for a more complex grammar, with both failure strings and success strings + test_grammar( + "medium complexity grammar", + // Grammar + R"""( + root ::= expression + expression ::= term ws (("+"|"-") ws term)* + term ::= factor ws (("*"|"/") ws factor)* + factor ::= number | variable | "(" expression ")" | function-call + number ::= [0-9]+ + variable ::= [a-zA-Z_][a-zA-Z0-9_]* + function-call ::= variable ws "(" (expression ("," ws expression)*)? ")" + ws ::= [ \t\n\r]?)""", + // Passing strings + { + "42", + "1*2*3*4*5", + "x", + "x+10", + "x1+y2", + "(a+b)*(c-d)", + "func()", + "func(x,y+2)", + "a*(b+c)-d/e", + "f(g(x),h(y,z))", + "x + 10", + "x1 + y2", + "(a + b) * (c - d)", + "func()", + "func(x, y + 2)", + "a * (b + c) - d / e", + "f(g(x), h(y, z))", + "123+456", + "123*456*789-123/456+789*123", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456" + }, + // Failing strings + { + "+", + "/ 3x", + "x + + y", + "a * / b", + "func(,)", + "func(x y)", + "(a + b", + "x + y)", + "a + b * (c - d", + "42 +", + "x +", + "x + 10 +", + "(a + b) * (c - d", + "func(", + "func(x, y + 2", + "a * (b + c) - d /", + "f(g(x), h(y, z)", + "123+456*789-123/456+789*123-456/789+123*456-789/123+456*789-123/456+789*123-456/", + } + ); +} + +static void test_quantifiers() { + // A collection of tests to exercise * + and ? quantifiers + + test_grammar( + "* quantifier", + // Grammar + R"""(root ::= "a"*)""", + // Passing strings + { + "", + "a", + "aaaaa", + "aaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + }, + // Failing strings + { + "b", + "ab", + "aab", + "ba", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" + } + ); + test_grammar( + "+ quantifier", + // Grammar + R"""(root ::= "a"+)""", + // Passing strings + { + "a", + "aaaaa", + "aaaaaaaaaaaaaaaaaa", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + }, + // Failing strings + { + "", + "b", + "ab", + "aab", + "ba", + "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab" + } + ); + test_grammar( + "? quantifier", + // Grammar + R"""(root ::= "a"?)""", + // Passing strings + { + "", + "a" + }, + // Failing strings + { + "b", + "ab", + "aa", + "ba", + } + ); + test_grammar( + "mixed quantifiers", + // Grammar + R"""( + root ::= cons+ vowel* cons? (vowel cons)* + vowel ::= [aeiouy] + cons ::= [bcdfghjklmnpqrstvwxyz] + )""", + // Passing strings + { + "yes", + "no", + "noyes", + "crwth", + "four", + "bryyyy", + }, + // Failing strings + { + "yess", + "yesno", + "forty", + "catyyy", + } + ); +} + +static void test_failure_missing_root() { + fprintf(stderr, "⚫ Testing missing root node:\n"); + // Test case for a grammar that is missing a root rule + const std::string grammar_str = R"""(rot ::= expr +expr ::= term ("+" term)* +term ::= number +number ::= [0-9]+)"""; + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we parsed correctly + assert(!parsed_grammar.rules.empty()); + + // Ensure we do NOT have a root node + assert(parsed_grammar.symbol_ids.find("root") == parsed_grammar.symbol_ids.end()); + fprintf(stderr, " ✅︎ Passed\n"); +} + +static void test_failure_missing_reference() { + fprintf(stderr, "⚫ Testing missing reference node:\n"); + + // Test case for a grammar that is missing a referenced rule + const std::string grammar_str = +R"""(root ::= expr +expr ::= term ("+" term)* +term ::= numero +number ::= [0-9]+)"""; + + fprintf(stderr, " Expected error: "); + + grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_str.c_str()); + + // Ensure we did NOT parsed correctly + assert(parsed_grammar.rules.empty()); + + fprintf(stderr, " End of expected error.\n"); + fprintf(stderr, " ✅︎ Passed\n"); +} + +int main() { + fprintf(stdout, "Running grammar integration tests...\n"); + test_simple_grammar(); + test_complex_grammar(); + test_quantifiers(); + test_failure_missing_root(); + test_failure_missing_reference(); + fprintf(stdout, "All tests passed.\n"); + return 0; +} diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 3c40e7563..c5361b5b8 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -2,6 +2,7 @@ #undef NDEBUG #endif +#include #include #include #include @@ -90,7 +91,7 @@ static void test_all(const std::string & lang, std::function test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - // tc.verify(read("test-grammar-output.tmp")); - //}); - //test_all("JavaScript", [](const TestCase & tc) { - // write("test-json-schema-input.tmp", tc.schema); - // tc.verify_status(std::system( - // "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); - // tc.verify(read("test-grammar-output.tmp")); - //}); + + if (getenv("LLAMA_PYTHON_AVAILABLE") || (std::system("python --version") == 0)) { + test_all("Python", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "python ./examples/json_schema_to_grammar.py test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Python not found, skipping Python JSON schema -> grammar tests.\n\033[0m"); + } + + if (getenv("LLAMA_NODE_AVAILABLE") || (std::system("node --version") == 0)) { + test_all("JavaScript", [](const TestCase & tc) { + write("test-json-schema-input.tmp", tc.schema); + tc.verify_status(std::system( + "node ./tests/run-json-schema-to-grammar.mjs test-json-schema-input.tmp > test-grammar-output.tmp") == 0 ? SUCCESS : FAILURE); + tc.verify(read("test-grammar-output.tmp")); + }); + } else { + fprintf(stderr, "\033[33mWARNING: Node not found, skipping JavaScript JSON schema -> grammar tests.\n\033[0m"); + } test_all("Check Expectations Validity", [](const TestCase & tc) { if (tc.expected_status == SUCCESS) { diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index f615b612d..e690ac6c8 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -138,11 +138,6 @@ int main(int argc, char * argv[]) { const ggml_type ei = (ggml_type)i; - if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) { - printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei)); - continue; - } - printf("Testing %s\n", ggml_type_name((ggml_type) i)); ggml_quantize_init(ei); diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp deleted file mode 100644 index 472b0b3a8..000000000 --- a/tests/test-tokenizer-0-falcon.cpp +++ /dev/null @@ -1,187 +0,0 @@ -#include "llama.h" -#include "common.h" -#include "console.h" - -#include -#include -#include -#include -#include - -// generate using test-tokenizer-0-falcon.py -static const std::map> & k_tests() { - static std::map> _k_tests = { - { "" , { }, }, - { " " , { 204, }, }, - { " " , { 258, }, }, - { " " , { 466, }, }, - { "\t" , { 192, }, }, - { "\n" , { 193, }, }, - { "\t\n" , { 19125, }, }, - { "Hello world" , { 9856, 1079, }, }, - { " Hello world" , { 23090, 1079, }, }, - { "Hello World" , { 9856, 2889, }, }, - { " Hello World" , { 23090, 2889, }, }, - { " Hello World!" , { 23090, 2889, 12, }, }, - { "Hello, world!" , { 9856, 23, 1079, 12, }, }, - { " Hello, world!" , { 23090, 23, 1079, 12, }, }, - { " this is 🦙.cpp" , { 414, 304, 3346, 111, 231, 25, 29247, }, }, - { "w048 7tuijk dsdfhu" , { 98, 55866, 204, 34, 16682, 7149, 36190, 6869, 11481, }, }, - { "нещо на Български" , { 150, 133, 6207, 151, 215, 150, 134, 5052, 133, 6279, 5052, 223, 151, 216, 49679, 123, 53110, 47043, 7795, }, }, - { "កាន់តែពិសេសអាចខលចេញ" , { 38154, 206, 38154, 126, 38154, 225, 167, 237, 217, 38154, 221, 167, 237, 208, 38154, 228, 38154, 127, 38154, 237, 167, 237, 207, 38154, 237, 38154, 107, 38154, 126, 38154, 211, 38154, 207, 38154, 233, 38154, 211, 167, 237, 207, 38154, 215, }, }, - { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 2571, 232, 206, 204, 19, 11003, 20, 8196, 126, 283, 219, 48778, 116, 13392, 204, 19, 51831, 732, 63209, 1741, 7955, 522, 20, 22438, 211, 204, 19, 7927, 53360, 325, 504, 701, 946, 10930, 20, }, }, - { "Hello" , { 9856, }, }, - { " Hello" , { 23090, }, }, - { " Hello" , { 204, 23090, }, }, - { " Hello" , { 258, 23090, }, }, - { " Hello" , { 466, 23090, }, }, - { " Hello\n Hello" , { 466, 23090, 742, 23090, }, }, - { "\n =" , { 1212, 40, }, }, - { "' era" , { 18, 4932, }, }, - }; - - return _k_tests; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - std::string fname_text; - if (argc > 2) { - fname_text = argv[2]; - } - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(); - - // load the vocab - { - auto mparams = llama_model_default_params(); - - mparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - - ctx = llama_new_context_with_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) { - fprintf(stderr, "%s : error: vocab type is not BPE\n", __func__); - llama_free_model(model); - llama_free(ctx); - return 2; - } - -#ifdef _WIN32 - // We need this for unicode console support - console::init(false, false); - atexit([]() { console::cleanup(); }); -#endif - - bool success = true; - - for (const auto & test_kv : k_tests()) { - const std::vector res = llama_tokenize(ctx, test_kv.first, false); - - printf("\n"); - printf("src: '%s'\n", test_kv.first.c_str()); - printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); - printf("tok: "); - for (const auto & tok : res) { - printf("%d ", tok); - } - printf("\n"); - - bool correct = res.size() == test_kv.second.size(); - - for (int i = 0; i < (int) res.size() && correct; ++i) { - if (test_kv.second[i] != res[i]) { - correct = false; - } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - llama_detokenize_bpe(ctx, res).c_str(), - llama_detokenize_bpe(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - - success = false; - } - } - - if (!fname_text.empty()) { - fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); - - std::string text; - { - std::ifstream ifs(fname_text); - if (!ifs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); - return 1; - } - text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); - } - - fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - - const std::vector res = llama_tokenize(ctx, text, false); - - fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); - - { - const std::string fname_out = fname_text + ".tokcpp"; - - std::ofstream ofs(fname_out); - if (!ofs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); - return 1; - } - - for (const auto & tok : res) { - ofs << tok << " '" << llama_detokenize_bpe(ctx, std::vector{tok}) << "'" << std::endl; - } - } - - fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return success ? 0 : 3; -} diff --git a/tests/test-tokenizer-0-falcon.py b/tests/test-tokenizer-0-falcon.py deleted file mode 100644 index 4f06ec9bb..000000000 --- a/tests/test-tokenizer-0-falcon.py +++ /dev/null @@ -1,82 +0,0 @@ -# tests with BPE tokenizer - -import argparse - -from transformers import AutoTokenizer - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -args = parser.parse_args() - -dir_tokenizer = args.dir_tokenizer - -tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", - "\n =", - "' era", -] - -for text in tests: - print('text: ', text) - print(tokenizer.encode(text)) - print(tokenizer.decode(tokenizer.encode(text))) - -print("\n\ntests for C++:\n") -for text in tests: - res = tokenizer.encode(text) - - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - print("{ %-24s, { " % k, end='') - for x in res: - print("%7d," % x, end='') - print(" }, },") - -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - print('tokenizing file: ', fname_tok) - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0-llama.cpp b/tests/test-tokenizer-0-llama.cpp deleted file mode 100644 index 0a16cd7eb..000000000 --- a/tests/test-tokenizer-0-llama.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "llama.h" -#include "common.h" -#include "console.h" - -#include -#include -#include -#include -#include - -// generate using test-tokenizer-0-llama.py -static const std::map> & k_tests() { - static std::map> _k_tests = { - { "" , { }, }, - { " " , { 259, }, }, - { " " , { 1678, }, }, - { " " , { 268, }, }, - { "\t" , { 29871, 12, }, }, - { "\n" , { 29871, 13, }, }, - { "\t\n" , { 29871, 12, 13, }, }, - { "Hello world" , { 15043, 3186, }, }, - { " Hello world" , { 29871, 15043, 3186, }, }, - { "Hello World" , { 15043, 2787, }, }, - { " Hello World" , { 29871, 15043, 2787, }, }, - { " Hello World!" , { 29871, 15043, 2787, 29991, }, }, - { "Hello, world!" , { 15043, 29892, 3186, 29991, }, }, - { " Hello, world!" , { 29871, 15043, 29892, 3186, 29991, }, }, - { " this is 🦙.cpp" , { 29871, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, }, - { "w048 7tuijk dsdfhu" , { 281, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, }, - { "нещо на Български" , { 1538, 4851, 665, 1386, 29713, 1305, }, }, - { "កាន់តែពិសេសអាចខលចេញ" , { 29871, 31849, 31324, 31934, 228, 162, 142, 228, 161, 146, 228, 162, 133, 228, 161, 153, 228, 161, 186, 31708, 228, 162, 132, 31708, 228, 161, 165, 31324, 228, 161, 136, 228, 161, 132, 228, 161, 158, 228, 161, 136, 228, 162, 132, 228, 161, 140, }, }, - { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 29871, 243, 162, 157, 131, 313, 8945, 29897, 29871, 243, 162, 155, 185, 30722, 243, 162, 143, 174, 30598, 313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681, 313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, }, - { "Hello" , { 15043, }, }, - { " Hello" , { 29871, 15043, }, }, - { " Hello" , { 259, 15043, }, }, - { " Hello" , { 1678, 15043, }, }, - { " Hello" , { 268, 15043, }, }, - { " Hello\n Hello" , { 268, 15043, 13, 1678, 15043, }, }, - { " (" , { 29871, 313, }, }, - }; - - return _k_tests; -} - -int main(int argc, char **argv) { - if (argc < 2) { - fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); - return 1; - } - - const std::string fname = argv[1]; - - std::string fname_text; - if (argc > 2) { - fname_text = argv[2]; - } - - fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); - - llama_model * model; - llama_context * ctx; - - llama_backend_init(); - - // load the vocab - { - auto mparams = llama_model_default_params(); - - mparams.vocab_only = true; - - model = llama_load_model_from_file(fname.c_str(), mparams); - - if (model == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - return 1; - } - - auto cparams = llama_context_default_params(); - - ctx = llama_new_context_with_model(model, cparams); - - if (ctx == NULL) { - fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); - llama_free_model(model); - return 1; - } - } - - if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) { - fprintf(stderr, "%s : error: vocab type is not SPM\n", __func__); - llama_free_model(model); - llama_free(ctx); - return 2; - } - -#ifdef _WIN32 - // We need this for unicode console support - console::init(false, false); - atexit([]() { console::cleanup(); }); -#endif - - bool success = true; - - for (const auto & test_kv : k_tests()) { - const std::vector res_bos = llama_tokenize(ctx, test_kv.first, true); - const std::vector res_nobos = llama_tokenize(ctx, test_kv.first, false); - - printf("\n"); - printf("src: '%s'\n", test_kv.first.c_str()); - printf("res: '%s'\n", llama_detokenize_spm(ctx, res_bos).c_str()); - printf("tok: "); - for (const auto & tok : res_bos) { - printf("%d ", tok); - } - printf("\n"); - - bool correct = res_nobos.size() == test_kv.second.size() && res_bos.size() == res_nobos.size() + 1 && res_bos[0] == 1; - - for (int i = 0; i < (int) res_nobos.size() && correct; ++i) { - if (test_kv.second[i] != res_bos[i + 1]) { - correct = false; - } - if (test_kv.second[i] != res_nobos[i]) { - correct = false; - } - } - - if (!correct) { - fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); - fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, - llama_detokenize_spm(ctx, res_nobos).c_str(), - llama_detokenize_spm(ctx, test_kv.second).c_str()); - fprintf(stderr, "%s : expected tokens: ", __func__); - for (const auto & t : test_kv.second) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - fprintf(stderr, "%s : got tokens: ", __func__); - for (const auto & t : res_nobos) { - fprintf(stderr, "%6d, ", t); - } - fprintf(stderr, "\n"); - - success = false; - } - } - - if (!fname_text.empty()) { - fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); - - std::string text; - { - std::ifstream ifs(fname_text); - if (!ifs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); - return 1; - } - text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); - } - - fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); - - const std::vector res = llama_tokenize(ctx, text, true); - - fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); - - { - const std::string fname_out = fname_text + ".tokcpp"; - - std::ofstream ofs(fname_out); - if (!ofs) { - fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); - return 1; - } - - for (const auto & tok : res) { - ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector{tok}) << "'" << std::endl; - } - } - - fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); - } - - llama_free_model(model); - llama_free(ctx); - - llama_backend_free(); - - return success ? 0 : 3; -} diff --git a/tests/test-tokenizer-0-llama.py b/tests/test-tokenizer-0-llama.py deleted file mode 100644 index f3d4d7e3d..000000000 --- a/tests/test-tokenizer-0-llama.py +++ /dev/null @@ -1,92 +0,0 @@ -# tests with SPM tokenizer - -import argparse - -from sentencepiece import SentencePieceProcessor - -parser = argparse.ArgumentParser() -parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") -parser.add_argument("--fname-tok", help="path to a text file to tokenize") -args = parser.parse_args() - -dir_tokenizer = args.dir_tokenizer - -tokenizer = SentencePieceProcessor(dir_tokenizer + '/tokenizer.model') - -tests = [ - "", - " ", - " ", - " ", - "\t", - "\n", - "\t\n", - "Hello world", - " Hello world", - "Hello World", - " Hello World", - " Hello World!", - "Hello, world!", - " Hello, world!", - " this is 🦙.cpp", - "w048 7tuijk dsdfhu", - "нещо на Български", - "កាន់តែពិសេសអាចខលចេញ", - "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", - "Hello", - " Hello", - " Hello", - " Hello", - " Hello", - " Hello\n Hello", -] - - -for text in tests: - print('text: ', text) - print('\nwith bos:') - print(tokenizer.encode(text, add_bos=True)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=True))) - print('\nwithout bos:') - print(tokenizer.encode(text, add_bos=False)) - print(tokenizer.decode(tokenizer.encode(text, add_bos=False))) - -print("'" + tokenizer.id_to_piece(15043) + "'") # '_Hello' -print("'" + tokenizer.id_to_piece(29871) + "'") # '_' -print("'" + tokenizer.decode([15043]) + "'") # 'Hello' -print("'" + tokenizer.decode([15043, 15043]) + "'") # 'Hello Hello' -print("'" + tokenizer.decode([29871, 15043]) + "'") # ' Hello' -print("'" + tokenizer.decode([29871, 15043, 29871, 15043]) + "'") # ' Hello Hello' - -print("\n\ntests for C++:\n") -for text in tests: - res = tokenizer.encode(text, add_bos=False) - - k = text.replace('\n', '\\n') - k = k.replace('\t', '\\t') - k = '"' + k + '"' - print("{ %-24s, { " % k, end='') - for x in res: - print("%7d," % x, end='') - print(" }, },") - -print(tokenizer.encode('hello')) -print(tokenizer.encode('world')) -print(tokenizer.encode(' world')) -print(tokenizer.encode('hello world')) - -fname_tok = args.fname_tok -if fname_tok: - print('tokenizing file: ', fname_tok) - fname_out = fname_tok + '.tok' - with open(fname_tok, 'r', encoding='utf-8') as f: - lines = f.readlines() - s = ''.join(lines) - res = tokenizer.encode(s, add_bos=True) - # write to file - with open(fname_out, 'w', encoding='utf-8') as f: - for x in res: - f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') - print('len(res): ', len(res)) - print('len(lines): ', len(lines)) - print('results written to: ', fname_out) diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp new file mode 100644 index 000000000..d478f1041 --- /dev/null +++ b/tests/test-tokenizer-0.cpp @@ -0,0 +1,292 @@ +#include "llama.h" +#include "common.h" +#include "console.h" + +#include +#include +#include +#include +#include + +//static const std::map> & k_tests() { +// static std::map> _k_tests = { +// { "" , { }, }, +// { " " , { 220, }, }, +// { " " , { 256, }, }, +// { " " , { 262, }, }, +// { "\t" , { 197, }, }, +// { "\n" , { 198, }, }, +// { "\n\n" , { 271, }, }, +// { "\n\n\n" , { 1432, }, }, +// { "\t\n" , { 1602, }, }, +// { "Hello world" , { 9906, 1917, }, }, +// { " Hello world" , { 22691, 1917, }, }, +// { "Hello World" , { 9906, 4435, }, }, +// { " Hello World" , { 22691, 4435, }, }, +// { " Hello World!" , { 22691, 4435, 0, }, }, +// { "Hello, world!" , { 9906, 11, 1917, 0, }, }, +// { " Hello, world!" , { 22691, 11, 1917, 0, }, }, +// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, }, +// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, }, +// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, }, +// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, }, +// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, }, +// { "Hello" , { 9906, }, }, +// { " Hello" , { 22691, }, }, +// { " Hello" , { 220, 22691, }, }, +// { " Hello" , { 256, 22691, }, }, +// { " Hello" , { 262, 22691, }, }, +// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, }, +// { " (" , { 320, }, }, +// { "\n =" , { 198, 284, }, }, +// { "' era" , { 6, 11639, }, }, +// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, }, +// { "3" , { 18, }, }, +// { "33" , { 1644, }, }, +// { "333" , { 8765, }, }, +// { "3333" , { 8765, 18, }, }, +// { "33333" , { 8765, 1644, }, }, +// { "333333" , { 8765, 8765, }, }, +// { "3333333" , { 8765, 8765, 18, }, }, +// { "33333333" , { 8765, 8765, 1644, }, }, +// { "333333333" , { 8765, 8765, 8765, }, }, +// }; +// +// return _k_tests; +//} + +using llama_tests = std::map>; + +static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) { + llama_tests tests; + + std::ifstream ifs_inp(fname_inp); + if (!ifs_inp) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str()); + return tests; + } + + std::string sraw((std::istreambuf_iterator(ifs_inp)), std::istreambuf_iterator()); + + std::ifstream ifs_out(fname_out); + if (!ifs_out) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return tests; + } + + std::vector sout; + for (std::string line; std::getline(ifs_out, line);) { + sout.push_back(line); + } + + const std::string sep = "\n__ggml_vocab_test__\n"; + + std::vector sinp; + + size_t pos = 0; + while (pos < sraw.size()) { + const size_t next = sraw.find(sep, pos); + if (next == std::string::npos) { + sinp.push_back(sraw.substr(pos)); + break; + } + sinp.push_back(sraw.substr(pos, next - pos)); + pos = next + sep.size(); + } + + if (sinp.size() != sout.size()) { + fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__); + return tests; + } + + for (size_t i = 0; i < sinp.size(); ++i) { + const std::string & s = sinp[i]; + const std::string & o = string_strip(sout[i]); + + std::vector toks; + + size_t pos = 0; + while (pos < o.size()) { + size_t next = o.find(' ', pos); + if (next == std::string::npos) { + next = o.size(); + } + const std::string stok = o.substr(pos, next - pos); + toks.push_back(std::stoi(stok)); + pos = next + 1; + } + + tests[s] = toks; + } + + return tests; +} + +int main(int argc, char **argv) { + if (argc < 2) { + fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]); + return 1; + } + + const std::string fname = argv[1]; + + const std::string fname_inp = fname + ".inp"; + const std::string fname_out = fname + ".out"; + + std::string fname_text; + if (argc > 2) { + fname_text = argv[2]; + } + + fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str()); + + llama_model * model; + llama_context * ctx; + + llama_backend_init(); + + // load the vocab + { + auto mparams = llama_model_default_params(); + + mparams.vocab_only = true; + + model = llama_load_model_from_file(fname.c_str(), mparams); + + if (model == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + return 1; + } + + auto cparams = llama_context_default_params(); + + ctx = llama_new_context_with_model(model, cparams); + + if (ctx == NULL) { + fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str()); + llama_free_model(model); + return 1; + } + } + +#ifdef _WIN32 + // We need this for unicode console support + console::init(false, false); + atexit([]() { console::cleanup(); }); +#endif + + bool success = true; + + const auto k_tests = [&]() -> llama_tests { + if (!fname_text.empty()) { + return {}; + } + + const auto res = read_tests(fname_inp, fname_out); + + if (res.empty()) { + fprintf(stderr, "%s : error: no tests found\n", __func__); + exit(1); + } + + return res; + }(); + + const bool add_special = false; + + for (const auto & test_kv : k_tests) { + const std::vector res = llama_tokenize(ctx, test_kv.first, add_special); + + printf("\n"); + printf("src: '%s'\n", test_kv.first.c_str()); + printf("res: '%s'\n", llama_detokenize_bpe(ctx, res).c_str()); + printf("tok: "); + for (const auto & tok : res) { + printf("%d ", tok); + } + printf("\n"); + + bool correct = res.size() == test_kv.second.size(); + for (int i = 0; i < (int) res.size() && correct; ++i) { + if (test_kv.second[i] != res[i]) { + correct = false; + } + } + + if (!correct) { + fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str()); + fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__, + llama_detokenize_bpe(ctx, res).c_str(), + llama_detokenize_bpe(ctx, test_kv.second).c_str()); + fprintf(stderr, "%s : expected tokens: ", __func__); + for (const auto & t : test_kv.second) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + fprintf(stderr, "%s : got tokens: ", __func__); + for (const auto & t : res) { + fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str()); + } + fprintf(stderr, "\n"); + + success = false; + } + } + + if (!fname_text.empty()) { + fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str()); + + std::string text; + { + std::ifstream ifs(fname_text); + if (!ifs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str()); + return 1; + } + text = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); + } + + fprintf(stderr, "%s : text size: %zu\n", __func__, text.size()); + + std::vector res; + + { + const auto t_start = ggml_time_us(); + + res = llama_tokenize(ctx, text, add_special); + + const auto t_end = ggml_time_us(); + + fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0); + } + + fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size()); + + { + const std::string fname_out = fname_text + ".tokcpp"; + + std::ofstream ofs(fname_out); + if (!ofs) { + fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str()); + return 1; + } + + for (const auto & tok : res) { + //ofs << tok << " '" << string_strip(llama_detokenize_bpe(ctx, std::vector{tok})) << "'" << std::endl; + ofs << tok << "\n"; + } + } + + fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str()); + } + + llama_free_model(model); + llama_free(ctx); + + llama_backend_free(); + + printf("\n"); + printf("Tests %s\n", success ? "passed" : "failed"); + + return success ? 0 : 3; +} diff --git a/tests/test-tokenizer-0.py b/tests/test-tokenizer-0.py new file mode 100644 index 000000000..cd760d1ce --- /dev/null +++ b/tests/test-tokenizer-0.py @@ -0,0 +1,46 @@ +import time +import argparse + +from transformers import AutoTokenizer + +parser = argparse.ArgumentParser() +parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") +parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True) +args = parser.parse_args() + +dir_tokenizer = args.dir_tokenizer +fname_tok = args.fname_tok + +tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer) + +print('tokenizing file: ', fname_tok) # noqa: NP100 +fname_out = fname_tok + '.tok' +with open(fname_tok, 'r', encoding='utf-8') as f: + lines = f.readlines() + s = ''.join(lines) + t_start = time.time() + res = tokenizer.encode(s, add_special_tokens=False) + t_end = time.time() + print('\nmain : tokenized in', "{:.3f}".format(1000.0 * (t_end - t_start)), 'ms (py)') # noqa: NP100 + with open(fname_out, 'w', encoding='utf-8') as f: + for x in res: + # LLaMA v3 for some reason strips the space for these tokens (and others) + # if x == 662: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 1174: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 2564: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 758: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 949: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # elif x == 5354: + # f.write(str(x) + ' \' ' + tokenizer.decode(x) + '\'\n') + # else: + # f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n') + # f.write(str(x) + ' \'' + tokenizer.decode(x).strip() + '\'\n') + f.write(str(x) + '\n') + print('len(res): ', len(res)) # noqa: NP100 + print('len(lines): ', len(lines)) # noqa: NP100 +print('results written to: ', fname_out) # noqa: NP100 diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh new file mode 100755 index 000000000..2fb8632d8 --- /dev/null +++ b/tests/test-tokenizer-0.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# +# Usage: +# +# test-tokenizer-0.sh +# + +if [ $# -ne 2 ]; then + printf "Usage: $0 \n" + exit 1 +fi + +name=$1 +input=$2 + +make -j tests/test-tokenizer-0 + +printf "Testing %s on %s ...\n" $name $input + +python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1 +cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in" + +./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1 +cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in" + +diff $input.tok $input.tokcpp > /dev/null 2>&1 + +if [ $? -eq 0 ]; then + printf "Tokenization is correct!\n" +else + diff $input.tok $input.tokcpp | head -n 32 + + printf "Tokenization differs!\n" +fi diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-spm.cpp similarity index 98% rename from tests/test-tokenizer-1-llama.cpp rename to tests/test-tokenizer-1-spm.cpp index 8caf0b24e..ac2333dda 100644 --- a/tests/test-tokenizer-1-llama.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -12,7 +12,7 @@ #include #include -int main(int argc, char **argv) { +int main(int argc, char ** argv) { if (argc < 2) { fprintf(stderr, "Usage: %s \n", argv[0]); return 1; diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py new file mode 100644 index 000000000..5b2ab8ef7 --- /dev/null +++ b/tests/test-tokenizer-random.py @@ -0,0 +1,295 @@ +# Test libllama tokenizer == AutoTokenizer. +# Brute force random tokens/text generation. +# +# Sample usage: +# +# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe +# + +import time +import logging +import argparse +import subprocess +import random + +from typing import Iterator + +import cffi +from transformers import AutoTokenizer, PreTrainedTokenizerBase + +logger = logging.getLogger("test-tokenizer-random-bpe") + + +class LibLlama: + + DEFAULT_PATH_LLAMA_H = "./llama.h" + DEFAULT_PATH_LIBLLAMA = "./build/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON + + def __init__(self, path_llama_h: str = None, path_libllama: str = None): + path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H + path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA + (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama) + self.lib.llama_backend_init() + + def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str): + cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h] + res = subprocess.run(cmd, stdout=subprocess.PIPE) + assert (res.returncode == 0) + source = res.stdout.decode() + ffi = cffi.FFI() + if True: # workarounds for pycparser + source = "typedef struct { } __builtin_va_list;" + "\n" + source + source = source.replace("sizeof (int)", str(ffi.sizeof("int"))) + source = source.replace("sizeof (void *)", str(ffi.sizeof("void*"))) + source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t"))) + source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t"))) + ffi.cdef(source, override=True) + lib = ffi.dlopen(path_libllama) + return (ffi, lib) + + def model_default_params(self, **kwargs): + mparams = self.lib.llama_model_default_params() + for k, v in kwargs.items(): + setattr(mparams, k, v) + return mparams + + def context_default_params(self, **kwargs): + cparams = self.lib.llama_context_default_params() + for k, v in kwargs.items(): + setattr(cparams, k, v) + return cparams + + +class LibLlamaModel: + + def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}): + self.lib = libllama.lib + self.ffi = libllama.ffi + if isinstance(mparams, dict): + mparams = libllama.model_default_params(**mparams) + self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams) + if not self.model: + raise RuntimeError("error: failed to load model '%s'" % path_model) + if isinstance(cparams, dict): + cparams = libllama.context_default_params(**cparams) + self.ctx = self.lib.llama_new_context_with_model(self.model, cparams) + if not self.ctx: + raise RuntimeError("error: failed to create context for model '%s'" % path_model) + n_tokens_max = self.lib.llama_n_ctx(self.ctx) + self.token_ids = self.ffi.new("llama_token[]", n_tokens_max) + + def free(self): + if self.ctx: + self.lib.llama_free(self.ctx) + if self.model: + self.lib.llama_free_model(self.model) + self.ctx = None + self.model = None + self.lib = None + + def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]: + n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids) + text = text.encode("utf-8") + num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special) + if num < 0: + return [] + return list(self.token_ids[0:num]) + + +def generator_custom_text() -> Iterator[str]: + """General tests""" + yield from [ + "", + " ", + " ", + " ", + "\t", + "\n", + "\n\n", + "\n\n\n", + "\t\n", + "Hello world", + " Hello world", + "Hello World", + " Hello World", + " Hello World!", + "Hello, world!", + " Hello, world!", + " this is 🦙.cpp", + "w048 7tuijk dsdfhu", + "нещо на Български", + "កាន់តែពិសេសអាចខលចេញ", + "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", + "Hello", + " Hello", + " Hello", + " Hello", + " Hello", + " Hello\n Hello", + " (", + "\n =", + "' era", + "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~", + "3", + "33", + "333", + "3333", + "33333", + "333333", + "3333333", + "33333333", + "333333333", + ] + + +def generator_custom_text_edge_cases() -> Iterator[str]: + """Edge cases found while debugging""" + yield from [ + '\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F} + '¼-a', # unicode_ranges_digit, 0x00BC + '½-a', # unicode_ranges_digit, 0x00BD + '¾-a', # unicode_ranges_digit, 0x00BE + 'a 〇b', # unicode_ranges_digit, 0x3007 + 'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms + '\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM) + 'a' # TODO: Phi-3 fail + ] + + +def generator_random_chars(iterations = 100) -> Iterator[str]: + """Brute force random text with simple characters""" + + WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) + CHARS = list(set(""" + ABCDEFGHIJKLMNOPQRSTUVWXYZ + abcdefghijklmnopqrstuvwxyz + ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ + áéíóúàèìòùâêîôûäëïöü + .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_ + """)) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + num_words = rand.randint(300, 400) + for i in range(num_words): + k = rand.randint(1, 7) + word = rand.choices(CHARS, k=k) + space = rand.choice(WHITESPACES) + text.append("".join(word) + space) + yield "".join(text) + + +def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]: + """Brute force random text with vocab characters""" + + vocab_ids = list(tokenizer.vocab.values()) + vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True) + vocab_chars = list(set(vocab_text)) + del vocab_ids, vocab_text + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = rand.choices(vocab_chars, k=1024) + yield "".join(text) + + +def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]: + """Brute force random text from vocab tokens""" + + space_id = tokenizer.encode(" ", add_special_tokens=False)[0] + vocab_ids = list(tokenizer.vocab.values()) + vocab_ids = list(sorted(vocab_ids + vocab_ids)) + for i in range(1, len(vocab_ids), 2): + vocab_ids[i] = space_id + vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True) + vocab_tokens = vocab_tokens.split(" ") + del vocab_ids + + yield from vocab_tokens + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + num_words = rand.randint(300, 400) + for i in range(num_words): + k = rand.randint(1, 3) + tokens = rand.choices(vocab_tokens, k=k) + tokens = [t.strip(" \n\r\t") for t in tokens] + sep = rand.choice(" \n\r\t") + text.append("".join(tokens) + sep) + yield "".join(text) + + +def generator_random_bytes(iterations = 100) -> Iterator[str]: + """Brute force random bytes""" + + WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5) + + rand = random.Random() + for m in range(iterations): + rand.seed(m) + text = [] + num_words = rand.randint(300, 400) + for i in range(num_words): + k = rand.randint(1, 8) + word = [chr(r) for r in rand.randbytes(k) if r] + word.append(rand.choice(WHITESPACES)) + text.append("".join(word)) + yield "".join(text) + + +def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]): + + def find_first_mismatch(ids1: list[int], ids2: list[int]): + for i, (a,b) in enumerate(zip(ids1, ids2)): + if a != b: + return i + if len(ids1) == len(ids2): + return -1 + return min(len(ids1), len(ids2)) + + t0 = time.perf_counter() + logger.info("%s: %s" % (generator.__name__, "ini")) + for text in generator: + ids1 = model.tokenize(text, add_special=False, parse_special=False) + ids2 = tokenizer.encode(text, add_special_tokens=False) + if ids1 != ids2: + i = find_first_mismatch(ids1, ids2) + ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1] + ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1] + text2 = tokenizer.decode(ids2, skip_special_tokens=True) + assert (text2 in text) + logger.info(" Text: " + repr(text2)) + logger.info(" TokenIDs: " + str(ids1)) + logger.info(" Expected: " + str(ids2)) + raise Exception() + t1 = time.perf_counter() + logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0)) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("vocab_file", help="path to vocab 'gguf' file") + parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file") + parser.add_argument("--verbose", action="store_true", help="increase output verbosity") + args = parser.parse_args() + + logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO) + + model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048)) + + tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer) + + test_compare_tokenizer(model, tokenizer, generator_custom_text()) + test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases()) + test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000)) + test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000)) + test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000)) + # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL + + model.free() diff --git a/unicode-data.cpp b/unicode-data.cpp new file mode 100644 index 000000000..c54175fc3 --- /dev/null +++ b/unicode-data.cpp @@ -0,0 +1,2183 @@ +#include "unicode-data.h" + +#include +#include +#include +#include + +// generated with scripts/gen-unicode-data.py +// +// TODO: generate unicode_map_nfd + +const std::vector> unicode_ranges_number = { +{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x000000BC, 0x000000BE}, +{0x00000660, 0x00000669}, {0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, +{0x000009E6, 0x000009EF}, {0x000009F4, 0x000009F9}, {0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, +{0x00000B66, 0x00000B6F}, {0x00000B72, 0x00000B77}, {0x00000BE6, 0x00000BF2}, {0x00000C66, 0x00000C6F}, +{0x00000C78, 0x00000C7E}, {0x00000CE6, 0x00000CEF}, {0x00000D58, 0x00000D5E}, {0x00000D66, 0x00000D78}, +{0x00000DE6, 0x00000DEF}, {0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F33}, +{0x00001040, 0x00001049}, {0x00001090, 0x00001099}, {0x00001369, 0x0000137C}, {0x000016EE, 0x000016F0}, +{0x000017E0, 0x000017E9}, {0x000017F0, 0x000017F9}, {0x00001810, 0x00001819}, {0x00001946, 0x0000194F}, +{0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, {0x00001B50, 0x00001B59}, +{0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, {0x00002070, 0x00002070}, +{0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002150, 0x00002182}, {0x00002185, 0x00002189}, +{0x00002460, 0x0000249B}, {0x000024EA, 0x000024FF}, {0x00002776, 0x00002793}, {0x00002CFD, 0x00002CFD}, +{0x00003007, 0x00003007}, {0x00003021, 0x00003029}, {0x00003038, 0x0000303A}, {0x00003192, 0x00003195}, +{0x00003220, 0x00003229}, {0x00003248, 0x0000324F}, {0x00003251, 0x0000325F}, {0x00003280, 0x00003289}, +{0x000032B1, 0x000032BF}, {0x0000A620, 0x0000A629}, {0x0000A6E6, 0x0000A6EF}, {0x0000A830, 0x0000A835}, +{0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, {0x0000A9F0, 0x0000A9F9}, +{0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, {0x00010107, 0x00010133}, +{0x00010140, 0x00010178}, {0x0001018A, 0x0001018B}, {0x000102E1, 0x000102FB}, {0x00010320, 0x00010323}, +{0x00010341, 0x00010341}, {0x0001034A, 0x0001034A}, {0x000103D1, 0x000103D5}, {0x000104A0, 0x000104A9}, +{0x00010858, 0x0001085F}, {0x00010879, 0x0001087F}, {0x000108A7, 0x000108AF}, {0x000108FB, 0x000108FF}, +{0x00010916, 0x0001091B}, {0x000109BC, 0x000109BD}, {0x000109C0, 0x000109CF}, {0x000109D2, 0x000109FF}, +{0x00010A40, 0x00010A48}, {0x00010A7D, 0x00010A7E}, {0x00010A9D, 0x00010A9F}, {0x00010AEB, 0x00010AEF}, +{0x00010B58, 0x00010B5F}, {0x00010B78, 0x00010B7F}, {0x00010BA9, 0x00010BAF}, {0x00010CFA, 0x00010CFF}, +{0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E7E}, {0x00010F1D, 0x00010F26}, {0x00010F51, 0x00010F54}, +{0x00010FC5, 0x00010FCB}, {0x00011052, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, +{0x000111D0, 0x000111D9}, {0x000111E1, 0x000111F4}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, +{0x000114D0, 0x000114D9}, {0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x0001173B}, +{0x000118E0, 0x000118F2}, {0x00011950, 0x00011959}, {0x00011C50, 0x00011C6C}, {0x00011D50, 0x00011D59}, +{0x00011DA0, 0x00011DA9}, {0x00011F50, 0x00011F59}, {0x00011FC0, 0x00011FD4}, {0x00012400, 0x0001246E}, +{0x00016A60, 0x00016A69}, {0x00016AC0, 0x00016AC9}, {0x00016B50, 0x00016B59}, {0x00016B5B, 0x00016B61}, +{0x00016E80, 0x00016E96}, {0x0001D2C0, 0x0001D2D3}, {0x0001D2E0, 0x0001D2F3}, {0x0001D360, 0x0001D378}, +{0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, {0x0001E2F0, 0x0001E2F9}, {0x0001E4F0, 0x0001E4F9}, +{0x0001E8C7, 0x0001E8CF}, {0x0001E950, 0x0001E959}, {0x0001EC71, 0x0001ECAB}, {0x0001ECAD, 0x0001ECAF}, +{0x0001ECB1, 0x0001ECB4}, {0x0001ED01, 0x0001ED2D}, {0x0001ED2F, 0x0001ED3D}, {0x0001F100, 0x0001F10C}, +{0x0001FBF0, 0x0001FBF9}, +}; + +const std::vector> unicode_ranges_letter = { +{0x00000041, 0x0000005A}, {0x00000061, 0x0000007A}, {0x000000AA, 0x000000AA}, {0x000000B5, 0x000000B5}, +{0x000000BA, 0x000000BA}, {0x000000C0, 0x000000D6}, {0x000000D8, 0x000000F6}, {0x000000F8, 0x000002C1}, +{0x000002C6, 0x000002D1}, {0x000002E0, 0x000002E4}, {0x000002EC, 0x000002EC}, {0x000002EE, 0x000002EE}, +{0x00000370, 0x00000374}, {0x00000376, 0x00000377}, {0x0000037A, 0x0000037D}, {0x0000037F, 0x0000037F}, +{0x00000386, 0x00000386}, {0x00000388, 0x0000038A}, {0x0000038C, 0x0000038C}, {0x0000038E, 0x000003A1}, +{0x000003A3, 0x000003F5}, {0x000003F7, 0x00000481}, {0x0000048A, 0x0000052F}, {0x00000531, 0x00000556}, +{0x00000559, 0x00000559}, {0x00000560, 0x00000588}, {0x000005D0, 0x000005EA}, {0x000005EF, 0x000005F2}, +{0x00000620, 0x0000064A}, {0x0000066E, 0x0000066F}, {0x00000671, 0x000006D3}, {0x000006D5, 0x000006D5}, +{0x000006E5, 0x000006E6}, {0x000006EE, 0x000006EF}, {0x000006FA, 0x000006FC}, {0x000006FF, 0x000006FF}, +{0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1}, +{0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815}, +{0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858}, +{0x00000860, 0x0000086A}, {0x00000870, 0x00000887}, {0x00000889, 0x0000088E}, {0x000008A0, 0x000008C9}, +{0x00000904, 0x00000939}, {0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, +{0x00000971, 0x00000980}, {0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, +{0x000009AA, 0x000009B0}, {0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, +{0x000009CE, 0x000009CE}, {0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, +{0x000009FC, 0x000009FC}, {0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, +{0x00000A2A, 0x00000A30}, {0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, +{0x00000A59, 0x00000A5C}, {0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, +{0x00000A8F, 0x00000A91}, {0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, +{0x00000AB5, 0x00000AB9}, {0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, +{0x00000AF9, 0x00000AF9}, {0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, +{0x00000B2A, 0x00000B30}, {0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, +{0x00000B5C, 0x00000B5D}, {0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, +{0x00000B85, 0x00000B8A}, {0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, +{0x00000B9C, 0x00000B9C}, {0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, +{0x00000BAE, 0x00000BB9}, {0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, +{0x00000C12, 0x00000C28}, {0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, +{0x00000C5D, 0x00000C5D}, {0x00000C60, 0x00000C61}, {0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, +{0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, {0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, +{0x00000CBD, 0x00000CBD}, {0x00000CDD, 0x00000CDE}, {0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, +{0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, {0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, +{0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, {0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, +{0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, {0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, +{0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, {0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, +{0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, {0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, +{0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, {0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, +{0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, {0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, +{0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, {0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, +{0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, {0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, +{0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, {0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, +{0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, {0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, +{0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, {0x00001250, 0x00001256}, {0x00001258, 0x00001258}, +{0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, {0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, +{0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, {0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, +{0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, {0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, +{0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, {0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, +{0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, {0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, +{0x00001700, 0x00001711}, {0x0000171F, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, +{0x0000176E, 0x00001770}, {0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, +{0x00001820, 0x00001878}, {0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, +{0x000018B0, 0x000018F5}, {0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, +{0x00001980, 0x000019AB}, {0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, +{0x00001AA7, 0x00001AA7}, {0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4C}, {0x00001B83, 0x00001BA0}, +{0x00001BAE, 0x00001BAF}, {0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, +{0x00001C5A, 0x00001C7D}, {0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, +{0x00001CE9, 0x00001CEC}, {0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, +{0x00001D00, 0x00001DBF}, {0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, +{0x00001F48, 0x00001F4D}, {0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, +{0x00001F5D, 0x00001F5D}, {0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, +{0x00001FBE, 0x00001FBE}, {0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, +{0x00001FD6, 0x00001FDB}, {0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, +{0x00002071, 0x00002071}, {0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, +{0x00002107, 0x00002107}, {0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, +{0x00002124, 0x00002124}, {0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, +{0x0000212F, 0x00002139}, {0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, +{0x00002183, 0x00002184}, {0x00002C00, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, {0x00002CF2, 0x00002CF3}, +{0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, {0x00002D30, 0x00002D67}, +{0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, {0x00002DA8, 0x00002DAE}, +{0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, {0x00002DC8, 0x00002DCE}, +{0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, {0x00003005, 0x00003006}, +{0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, {0x0000309D, 0x0000309F}, +{0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, {0x00003131, 0x0000318E}, +{0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, {0x00004E00, 0x0000A48C}, +{0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, {0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, +{0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, {0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, +{0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7CA}, {0x0000A7D0, 0x0000A7D1}, {0x0000A7D3, 0x0000A7D3}, +{0x0000A7D5, 0x0000A7D9}, {0x0000A7F2, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, +{0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7}, +{0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946}, +{0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4}, +{0x0000A9E6, 0x0000A9EF}, {0x0000A9FA, 0x0000A9FE}, {0x0000AA00, 0x0000AA28}, {0x0000AA40, 0x0000AA42}, +{0x0000AA44, 0x0000AA4B}, {0x0000AA60, 0x0000AA76}, {0x0000AA7A, 0x0000AA7A}, {0x0000AA7E, 0x0000AAAF}, +{0x0000AAB1, 0x0000AAB1}, {0x0000AAB5, 0x0000AAB6}, {0x0000AAB9, 0x0000AABD}, {0x0000AAC0, 0x0000AAC0}, +{0x0000AAC2, 0x0000AAC2}, {0x0000AADB, 0x0000AADD}, {0x0000AAE0, 0x0000AAEA}, {0x0000AAF2, 0x0000AAF4}, +{0x0000AB01, 0x0000AB06}, {0x0000AB09, 0x0000AB0E}, {0x0000AB11, 0x0000AB16}, {0x0000AB20, 0x0000AB26}, +{0x0000AB28, 0x0000AB2E}, {0x0000AB30, 0x0000AB5A}, {0x0000AB5C, 0x0000AB69}, {0x0000AB70, 0x0000ABE2}, +{0x0000AC00, 0x0000D7A3}, {0x0000D7B0, 0x0000D7C6}, {0x0000D7CB, 0x0000D7FB}, {0x0000F900, 0x0000FA6D}, +{0x0000FA70, 0x0000FAD9}, {0x0000FB00, 0x0000FB06}, {0x0000FB13, 0x0000FB17}, {0x0000FB1D, 0x0000FB1D}, +{0x0000FB1F, 0x0000FB28}, {0x0000FB2A, 0x0000FB36}, {0x0000FB38, 0x0000FB3C}, {0x0000FB3E, 0x0000FB3E}, +{0x0000FB40, 0x0000FB41}, {0x0000FB43, 0x0000FB44}, {0x0000FB46, 0x0000FBB1}, {0x0000FBD3, 0x0000FD3D}, +{0x0000FD50, 0x0000FD8F}, {0x0000FD92, 0x0000FDC7}, {0x0000FDF0, 0x0000FDFB}, {0x0000FE70, 0x0000FE74}, +{0x0000FE76, 0x0000FEFC}, {0x0000FF21, 0x0000FF3A}, {0x0000FF41, 0x0000FF5A}, {0x0000FF66, 0x0000FFBE}, +{0x0000FFC2, 0x0000FFC7}, {0x0000FFCA, 0x0000FFCF}, {0x0000FFD2, 0x0000FFD7}, {0x0000FFDA, 0x0000FFDC}, +{0x00010000, 0x0001000B}, {0x0001000D, 0x00010026}, {0x00010028, 0x0001003A}, {0x0001003C, 0x0001003D}, +{0x0001003F, 0x0001004D}, {0x00010050, 0x0001005D}, {0x00010080, 0x000100FA}, {0x00010280, 0x0001029C}, +{0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349}, +{0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF}, +{0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527}, +{0x00010530, 0x00010563}, {0x00010570, 0x0001057A}, {0x0001057C, 0x0001058A}, {0x0001058C, 0x00010592}, +{0x00010594, 0x00010595}, {0x00010597, 0x000105A1}, {0x000105A3, 0x000105B1}, {0x000105B3, 0x000105B9}, +{0x000105BB, 0x000105BC}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, +{0x00010780, 0x00010785}, {0x00010787, 0x000107B0}, {0x000107B2, 0x000107BA}, {0x00010800, 0x00010805}, +{0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, {0x0001083C, 0x0001083C}, +{0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, {0x000108E0, 0x000108F2}, +{0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, {0x00010980, 0x000109B7}, +{0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, {0x00010A15, 0x00010A17}, +{0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, {0x00010AC0, 0x00010AC7}, +{0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, {0x00010B60, 0x00010B72}, +{0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, {0x00010CC0, 0x00010CF2}, +{0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, {0x00010F00, 0x00010F1C}, +{0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010F70, 0x00010F81}, {0x00010FB0, 0x00010FC4}, +{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011071, 0x00011072}, {0x00011075, 0x00011075}, +{0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, {0x00011103, 0x00011126}, {0x00011144, 0x00011144}, +{0x00011147, 0x00011147}, {0x00011150, 0x00011172}, {0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, +{0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, {0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, +{0x00011213, 0x0001122B}, {0x0001123F, 0x00011240}, {0x00011280, 0x00011286}, {0x00011288, 0x00011288}, +{0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, {0x000112B0, 0x000112DE}, +{0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, {0x0001132A, 0x00011330}, +{0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, {0x00011350, 0x00011350}, +{0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, {0x0001145F, 0x00011461}, +{0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, {0x00011580, 0x000115AE}, +{0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, {0x00011680, 0x000116AA}, +{0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011740, 0x00011746}, {0x00011800, 0x0001182B}, +{0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913}, +{0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941}, +{0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3}, +{0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50}, +{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AB0, 0x00011AF8}, {0x00011C00, 0x00011C08}, +{0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06}, +{0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65}, +{0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, {0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2}, +{0x00011F02, 0x00011F02}, {0x00011F04, 0x00011F10}, {0x00011F12, 0x00011F33}, {0x00011FB0, 0x00011FB0}, +{0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00012F90, 0x00012FF0}, {0x00013000, 0x0001342F}, +{0x00013441, 0x00013446}, {0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, +{0x00016A70, 0x00016ABE}, {0x00016AD0, 0x00016AED}, {0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, +{0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, {0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, +{0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, {0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, +{0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, {0x00018D00, 0x00018D08}, {0x0001AFF0, 0x0001AFF3}, +{0x0001AFF5, 0x0001AFFB}, {0x0001AFFD, 0x0001AFFE}, {0x0001B000, 0x0001B122}, {0x0001B132, 0x0001B132}, +{0x0001B150, 0x0001B152}, {0x0001B155, 0x0001B155}, {0x0001B164, 0x0001B167}, {0x0001B170, 0x0001B2FB}, +{0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, {0x0001BC90, 0x0001BC99}, +{0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, {0x0001D4A2, 0x0001D4A2}, +{0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, {0x0001D4BB, 0x0001D4BB}, +{0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, {0x0001D50D, 0x0001D514}, +{0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, {0x0001D540, 0x0001D544}, +{0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, {0x0001D6A8, 0x0001D6C0}, +{0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, {0x0001D716, 0x0001D734}, +{0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, {0x0001D78A, 0x0001D7A8}, +{0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001DF00, 0x0001DF1E}, {0x0001DF25, 0x0001DF2A}, +{0x0001E030, 0x0001E06D}, {0x0001E100, 0x0001E12C}, {0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, +{0x0001E290, 0x0001E2AD}, {0x0001E2C0, 0x0001E2EB}, {0x0001E4D0, 0x0001E4EB}, {0x0001E7E0, 0x0001E7E6}, +{0x0001E7E8, 0x0001E7EB}, {0x0001E7ED, 0x0001E7EE}, {0x0001E7F0, 0x0001E7FE}, {0x0001E800, 0x0001E8C4}, +{0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F}, +{0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32}, +{0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, {0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42}, +{0x0001EE47, 0x0001EE47}, {0x0001EE49, 0x0001EE49}, {0x0001EE4B, 0x0001EE4B}, {0x0001EE4D, 0x0001EE4F}, +{0x0001EE51, 0x0001EE52}, {0x0001EE54, 0x0001EE54}, {0x0001EE57, 0x0001EE57}, {0x0001EE59, 0x0001EE59}, +{0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62}, +{0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77}, +{0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B}, +{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DF}, +{0x0002A700, 0x0002B739}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, +{0x0002EBF0, 0x0002EE5D}, {0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, {0x00031350, 0x000323AF}, +}; + +const std::vector> unicode_ranges_separator = { +{0x00000020, 0x00000020}, {0x000000A0, 0x000000A0}, {0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, +{0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, {0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, +}; + +const std::vector> unicode_ranges_whitespace = { +{0x00000009, 0x0000000D}, {0x00000020, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0}, +{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, +{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, +}; + +const std::vector> unicode_ranges_accent_mark = { +{0x00000300, 0x0000036F}, {0x00000483, 0x00000489}, {0x00000591, 0x000005BD}, {0x000005BF, 0x000005BF}, +{0x000005C1, 0x000005C2}, {0x000005C4, 0x000005C5}, {0x000005C7, 0x000005C7}, {0x00000610, 0x0000061A}, +{0x0000064B, 0x0000065F}, {0x00000670, 0x00000670}, {0x000006D6, 0x000006DC}, {0x000006DF, 0x000006E4}, +{0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A}, +{0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819}, +{0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B}, +{0x00000898, 0x0000089F}, {0x000008CA, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, +{0x0000093E, 0x0000094F}, {0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, +{0x000009BC, 0x000009BC}, {0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, +{0x000009D7, 0x000009D7}, {0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, +{0x00000A3C, 0x00000A3C}, {0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, +{0x00000A51, 0x00000A51}, {0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, +{0x00000ABC, 0x00000ABC}, {0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, +{0x00000AE2, 0x00000AE3}, {0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, +{0x00000B3E, 0x00000B44}, {0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, +{0x00000B62, 0x00000B63}, {0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, +{0x00000BCA, 0x00000BCD}, {0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3C, 0x00000C3C}, +{0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, {0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, +{0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, {0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, +{0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, {0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, +{0x00000CF3, 0x00000CF3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, {0x00000D3E, 0x00000D44}, +{0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, {0x00000D62, 0x00000D63}, +{0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, {0x00000DD6, 0x00000DD6}, +{0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, {0x00000E34, 0x00000E3A}, +{0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, {0x00000EC8, 0x00000ECE}, +{0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, {0x00000F39, 0x00000F39}, +{0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, {0x00000F8D, 0x00000F97}, +{0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, {0x00001056, 0x00001059}, +{0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, {0x00001071, 0x00001074}, +{0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, {0x0000135D, 0x0000135F}, +{0x00001712, 0x00001715}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, {0x00001772, 0x00001773}, +{0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, {0x0000180F, 0x0000180F}, +{0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B}, +{0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F}, +{0x00001AB0, 0x00001ACE}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, +{0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37}, +{0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4}, +{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DFF}, {0x000020D0, 0x000020F0}, {0x00002CEF, 0x00002CF1}, +{0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, {0x00003099, 0x0000309A}, +{0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, {0x0000A6F0, 0x0000A6F1}, +{0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, {0x0000A823, 0x0000A827}, +{0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, {0x0000A8E0, 0x0000A8F1}, +{0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, {0x0000A980, 0x0000A983}, +{0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, {0x0000AA43, 0x0000AA43}, +{0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, {0x0000AAB2, 0x0000AAB4}, +{0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, {0x0000AAEB, 0x0000AAEF}, +{0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, {0x0000FB1E, 0x0000FB1E}, +{0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, {0x000102E0, 0x000102E0}, +{0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, {0x00010A0C, 0x00010A0F}, +{0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, {0x00010D24, 0x00010D27}, +{0x00010EAB, 0x00010EAC}, {0x00010EFD, 0x00010EFF}, {0x00010F46, 0x00010F50}, {0x00010F82, 0x00010F85}, +{0x00011000, 0x00011002}, {0x00011038, 0x00011046}, {0x00011070, 0x00011070}, {0x00011073, 0x00011074}, +{0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x000110C2, 0x000110C2}, {0x00011100, 0x00011102}, +{0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182}, +{0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237}, +{0x0001123E, 0x0001123E}, {0x00011241, 0x00011241}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, +{0x0001133B, 0x0001133C}, {0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, +{0x00011357, 0x00011357}, {0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, +{0x00011435, 0x00011446}, {0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, +{0x000115B8, 0x000115C0}, {0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, +{0x0001171D, 0x0001172B}, {0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, +{0x0001193B, 0x0001193E}, {0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, +{0x000119DA, 0x000119E0}, {0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, +{0x00011A3B, 0x00011A3E}, {0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, +{0x00011C2F, 0x00011C36}, {0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, +{0x00011D31, 0x00011D36}, {0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, +{0x00011D47, 0x00011D47}, {0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, +{0x00011EF3, 0x00011EF6}, {0x00011F00, 0x00011F01}, {0x00011F03, 0x00011F03}, {0x00011F34, 0x00011F3A}, +{0x00011F3E, 0x00011F42}, {0x00013440, 0x00013440}, {0x00013447, 0x00013455}, {0x00016AF0, 0x00016AF4}, +{0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, {0x00016F8F, 0x00016F92}, +{0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, {0x0001CF00, 0x0001CF2D}, +{0x0001CF30, 0x0001CF46}, {0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, +{0x0001D185, 0x0001D18B}, {0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, +{0x0001DA3B, 0x0001DA6C}, {0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, +{0x0001DAA1, 0x0001DAAF}, {0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, +{0x0001E023, 0x0001E024}, {0x0001E026, 0x0001E02A}, {0x0001E08F, 0x0001E08F}, {0x0001E130, 0x0001E136}, +{0x0001E2AE, 0x0001E2AE}, {0x0001E2EC, 0x0001E2EF}, {0x0001E4EC, 0x0001E4EF}, {0x0001E8D0, 0x0001E8D6}, +{0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF}, +}; + +const std::vector> unicode_ranges_punctuation = { +{0x00000021, 0x00000023}, {0x00000025, 0x0000002A}, {0x0000002C, 0x0000002F}, {0x0000003A, 0x0000003B}, +{0x0000003F, 0x00000040}, {0x0000005B, 0x0000005D}, {0x0000005F, 0x0000005F}, {0x0000007B, 0x0000007B}, +{0x0000007D, 0x0000007D}, {0x000000A1, 0x000000A1}, {0x000000A7, 0x000000A7}, {0x000000AB, 0x000000AB}, +{0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E}, +{0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE}, +{0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4}, +{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061D, 0x0000061F}, +{0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9}, +{0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970}, +{0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77}, +{0x00000C84, 0x00000C84}, {0x00000DF4, 0x00000DF4}, {0x00000E4F, 0x00000E4F}, {0x00000E5A, 0x00000E5B}, +{0x00000F04, 0x00000F12}, {0x00000F14, 0x00000F14}, {0x00000F3A, 0x00000F3D}, {0x00000F85, 0x00000F85}, +{0x00000FD0, 0x00000FD4}, {0x00000FD9, 0x00000FDA}, {0x0000104A, 0x0000104F}, {0x000010FB, 0x000010FB}, +{0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C}, +{0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA}, +{0x00001800, 0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6}, +{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001B7D, 0x00001B7E}, {0x00001BFC, 0x00001BFF}, +{0x00001C3B, 0x00001C3F}, {0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, +{0x00002010, 0x00002027}, {0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, +{0x0000207D, 0x0000207E}, {0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, +{0x00002768, 0x00002775}, {0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, +{0x000029D8, 0x000029DB}, {0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, +{0x00002D70, 0x00002D70}, {0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E5D}, +{0x00003001, 0x00003003}, {0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, +{0x0000303D, 0x0000303D}, {0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, +{0x0000A60D, 0x0000A60F}, {0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, +{0x0000A874, 0x0000A877}, {0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, +{0x0000A92E, 0x0000A92F}, {0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, +{0x0000AA5C, 0x0000AA5F}, {0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, +{0x0000FD3E, 0x0000FD3F}, {0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, +{0x0000FE63, 0x0000FE63}, {0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, +{0x0000FF05, 0x0000FF0A}, {0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, +{0x0000FF3B, 0x0000FF3D}, {0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, +{0x0000FF5F, 0x0000FF65}, {0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, +{0x0001056F, 0x0001056F}, {0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, +{0x00010A50, 0x00010A58}, {0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, +{0x00010B99, 0x00010B9C}, {0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00010F86, 0x00010F89}, +{0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, {0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, +{0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, {0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, +{0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, {0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, +{0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, {0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, +{0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, {0x000116B9, 0x000116B9}, {0x0001173C, 0x0001173E}, +{0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, {0x00011A3F, 0x00011A46}, +{0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011B00, 0x00011B09}, {0x00011C41, 0x00011C45}, +{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011F43, 0x00011F4F}, {0x00011FFF, 0x00011FFF}, +{0x00012470, 0x00012474}, {0x00012FF1, 0x00012FF2}, {0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, +{0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, {0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, +{0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, {0x0001E95E, 0x0001E95F}, +}; + +const std::vector> unicode_ranges_symbol = { +{0x00000024, 0x00000024}, {0x0000002B, 0x0000002B}, {0x0000003C, 0x0000003E}, {0x0000005E, 0x0000005E}, +{0x00000060, 0x00000060}, {0x0000007C, 0x0000007C}, {0x0000007E, 0x0000007E}, {0x000000A2, 0x000000A6}, +{0x000000A8, 0x000000A9}, {0x000000AC, 0x000000AC}, {0x000000AE, 0x000000B1}, {0x000000B4, 0x000000B4}, +{0x000000B8, 0x000000B8}, {0x000000D7, 0x000000D7}, {0x000000F7, 0x000000F7}, {0x000002C2, 0x000002C5}, +{0x000002D2, 0x000002DF}, {0x000002E5, 0x000002EB}, {0x000002ED, 0x000002ED}, {0x000002EF, 0x000002FF}, +{0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482}, +{0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F}, +{0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6}, +{0x000007FE, 0x000007FF}, {0x00000888, 0x00000888}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, +{0x00000AF1, 0x00000AF1}, {0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, +{0x00000D4F, 0x00000D4F}, {0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, +{0x00000F13, 0x00000F13}, {0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, +{0x00000F36, 0x00000F36}, {0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, +{0x00000FCE, 0x00000FCF}, {0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, +{0x0000166D, 0x0000166D}, {0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, +{0x00001B61, 0x00001B6A}, {0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, +{0x00001FCD, 0x00001FCF}, {0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, +{0x00002044, 0x00002044}, {0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, +{0x000020A0, 0x000020C0}, {0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, +{0x00002114, 0x00002114}, {0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, +{0x00002127, 0x00002127}, {0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, +{0x00002140, 0x00002144}, {0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, +{0x00002190, 0x00002307}, {0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, +{0x0000249C, 0x000024E9}, {0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, +{0x000027F0, 0x00002982}, {0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, +{0x00002B76, 0x00002B95}, {0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, +{0x00002E80, 0x00002E99}, {0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFF}, +{0x00003004, 0x00003004}, {0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, +{0x0000303E, 0x0000303F}, {0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, +{0x000031C0, 0x000031E3}, {0x000031EF, 0x000031EF}, {0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, +{0x00003250, 0x00003250}, {0x00003260, 0x0000327F}, {0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, +{0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6}, {0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, +{0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B}, {0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, +{0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B}, {0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC2}, +{0x0000FD40, 0x0000FD4F}, {0x0000FDCF, 0x0000FDCF}, {0x0000FDFC, 0x0000FDFF}, {0x0000FE62, 0x0000FE62}, +{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, +{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, +{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, +{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, +{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, +{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, +{0x0001BC9C, 0x0001BC9C}, {0x0001CF50, 0x0001CFC3}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, +{0x0001D129, 0x0001D164}, {0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, +{0x0001D1AE, 0x0001D1EA}, {0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, +{0x0001D6C1, 0x0001D6C1}, {0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, +{0x0001D735, 0x0001D735}, {0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, +{0x0001D7A9, 0x0001D7A9}, {0x0001D7C3, 0x0001D7C3}, {0x0001D800, 0x0001D9FF}, {0x0001DA37, 0x0001DA3A}, +{0x0001DA6D, 0x0001DA74}, {0x0001DA76, 0x0001DA83}, {0x0001DA85, 0x0001DA86}, {0x0001E14F, 0x0001E14F}, +{0x0001E2FF, 0x0001E2FF}, {0x0001ECAC, 0x0001ECAC}, {0x0001ECB0, 0x0001ECB0}, {0x0001ED2E, 0x0001ED2E}, +{0x0001EEF0, 0x0001EEF1}, {0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, +{0x0001F0B1, 0x0001F0BF}, {0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, +{0x0001F1E6, 0x0001F202}, {0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, +{0x0001F260, 0x0001F265}, {0x0001F300, 0x0001F6D7}, {0x0001F6DC, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, +{0x0001F700, 0x0001F776}, {0x0001F77B, 0x0001F7D9}, {0x0001F7E0, 0x0001F7EB}, {0x0001F7F0, 0x0001F7F0}, +{0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847}, {0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, +{0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1}, {0x0001F900, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, +{0x0001FA70, 0x0001FA7C}, {0x0001FA80, 0x0001FA88}, {0x0001FA90, 0x0001FABD}, {0x0001FABF, 0x0001FAC5}, +{0x0001FACE, 0x0001FADB}, {0x0001FAE0, 0x0001FAE8}, {0x0001FAF0, 0x0001FAF8}, {0x0001FB00, 0x0001FB92}, +{0x0001FB94, 0x0001FBCA}, +}; + +const std::vector> unicode_ranges_control = { +{0x00000000, 0x0000001F}, {0x0000007F, 0x0000009F}, {0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, +{0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, {0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, +{0x00000530, 0x00000530}, {0x00000557, 0x00000558}, {0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, +{0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, {0x000005F5, 0x00000605}, {0x0000061C, 0x0000061C}, +{0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, {0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, +{0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, {0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, +{0x0000085F, 0x0000085F}, {0x0000086B, 0x0000086F}, {0x0000088F, 0x00000897}, {0x000008E2, 0x000008E2}, +{0x00000984, 0x00000984}, {0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, +{0x000009B1, 0x000009B1}, {0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, +{0x000009C9, 0x000009CA}, {0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, +{0x000009E4, 0x000009E5}, {0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, +{0x00000A11, 0x00000A12}, {0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, +{0x00000A37, 0x00000A37}, {0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, +{0x00000A49, 0x00000A4A}, {0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, +{0x00000A5F, 0x00000A65}, {0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, +{0x00000A92, 0x00000A92}, {0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, +{0x00000ABA, 0x00000ABB}, {0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, +{0x00000AD1, 0x00000ADF}, {0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, +{0x00000B04, 0x00000B04}, {0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, +{0x00000B31, 0x00000B31}, {0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, +{0x00000B49, 0x00000B4A}, {0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, +{0x00000B64, 0x00000B65}, {0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, +{0x00000B91, 0x00000B91}, {0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, +{0x00000BA0, 0x00000BA2}, {0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, +{0x00000BC3, 0x00000BC5}, {0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, +{0x00000BD8, 0x00000BE5}, {0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, +{0x00000C29, 0x00000C29}, {0x00000C3A, 0x00000C3B}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, +{0x00000C4E, 0x00000C54}, {0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5C}, {0x00000C5E, 0x00000C5F}, +{0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, {0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, +{0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, {0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, +{0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, {0x00000CD7, 0x00000CDC}, {0x00000CDF, 0x00000CDF}, +{0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, {0x00000CF4, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, +{0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, {0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, +{0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, {0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, +{0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, {0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, +{0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, {0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, +{0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, {0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, +{0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, {0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, +{0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, {0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, +{0x00000ECF, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, {0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, +{0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, {0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, +{0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, {0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, +{0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, {0x00001257, 0x00001257}, {0x00001259, 0x00001259}, +{0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, {0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, +{0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, {0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, +{0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, {0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, +{0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, {0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, +{0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, {0x00001716, 0x0000171E}, {0x00001737, 0x0000173F}, +{0x00001754, 0x0000175F}, {0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, +{0x000017DE, 0x000017DF}, {0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180E}, +{0x0000181A, 0x0000181F}, {0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, +{0x0000191F, 0x0000191F}, {0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, +{0x0000196E, 0x0000196F}, {0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, +{0x000019DB, 0x000019DD}, {0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, +{0x00001A8A, 0x00001A8F}, {0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001ACF, 0x00001AFF}, +{0x00001B4D, 0x00001B4F}, {0x00001B7F, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, +{0x00001C4A, 0x00001C4C}, {0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, +{0x00001CFB, 0x00001CFF}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, +{0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C}, +{0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5}, +{0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5}, +{0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F}, +{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C1, 0x000020CF}, +{0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F}, +{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, +{0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, {0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, +{0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, {0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, +{0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, {0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, +{0x00002DDF, 0x00002DDF}, {0x00002E5E, 0x00002E7F}, {0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, +{0x00002FD6, 0x00002FEF}, {0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, +{0x00003130, 0x00003130}, {0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EE}, {0x0000321F, 0x0000321F}, +{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF}, +{0x0000A7CB, 0x0000A7CF}, {0x0000A7D2, 0x0000A7D2}, {0x0000A7D4, 0x0000A7D4}, {0x0000A7DA, 0x0000A7F1}, +{0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, {0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, +{0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, {0x0000A97D, 0x0000A97F}, {0x0000A9CE, 0x0000A9CE}, +{0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, {0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, +{0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, {0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, +{0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, {0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, +{0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, {0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, +{0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF}, {0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, +{0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C}, {0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, +{0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42}, {0x0000FB45, 0x0000FB45}, {0x0000FBC3, 0x0000FBD2}, +{0x0000FD90, 0x0000FD91}, {0x0000FDC8, 0x0000FDCE}, {0x0000FDD0, 0x0000FDEF}, {0x0000FE1A, 0x0000FE1F}, +{0x0000FE53, 0x0000FE53}, {0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, +{0x0000FEFD, 0x0000FF00}, {0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, +{0x0000FFD8, 0x0000FFD9}, {0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, +{0x0000FFFE, 0x0000FFFF}, {0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, +{0x0001003E, 0x0001003E}, {0x0001004E, 0x0001004F}, {0x0001005E, 0x0001007F}, {0x000100FB, 0x000100FF}, +{0x00010103, 0x00010106}, {0x00010134, 0x00010136}, {0x0001018F, 0x0001018F}, {0x0001019D, 0x0001019F}, +{0x000101A1, 0x000101CF}, {0x000101FE, 0x0001027F}, {0x0001029D, 0x0001029F}, {0x000102D1, 0x000102DF}, +{0x000102FC, 0x000102FF}, {0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, +{0x0001039E, 0x0001039E}, {0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, +{0x000104AA, 0x000104AF}, {0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, +{0x00010564, 0x0001056E}, {0x0001057B, 0x0001057B}, {0x0001058B, 0x0001058B}, {0x00010593, 0x00010593}, +{0x00010596, 0x00010596}, {0x000105A2, 0x000105A2}, {0x000105B2, 0x000105B2}, {0x000105BA, 0x000105BA}, +{0x000105BD, 0x000105FF}, {0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x0001077F}, +{0x00010786, 0x00010786}, {0x000107B1, 0x000107B1}, {0x000107BB, 0x000107FF}, {0x00010806, 0x00010807}, +{0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, {0x0001083D, 0x0001083E}, +{0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, {0x000108F3, 0x000108F3}, +{0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, {0x00010940, 0x0001097F}, +{0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, {0x00010A07, 0x00010A0B}, +{0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, {0x00010A3B, 0x00010A3E}, +{0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, {0x00010AE7, 0x00010AEA}, +{0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, {0x00010B73, 0x00010B77}, +{0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, {0x00010C49, 0x00010C7F}, +{0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, {0x00010D3A, 0x00010E5F}, +{0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, {0x00010EB2, 0x00010EFC}, +{0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010F6F}, {0x00010F8A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, +{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011076, 0x0001107E}, {0x000110BD, 0x000110BD}, +{0x000110C3, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, +{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, +{0x00011212, 0x00011212}, {0x00011242, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, +{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, +{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, +{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, +{0x00011345, 0x00011346}, {0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, +{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, +{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, +{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, +{0x0001166D, 0x0001167F}, {0x000116BA, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, +{0x0001172C, 0x0001172F}, {0x00011747, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, +{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, +{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, +{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, +{0x00011AA3, 0x00011AAF}, {0x00011AF9, 0x00011AFF}, {0x00011B0A, 0x00011BFF}, {0x00011C09, 0x00011C09}, +{0x00011C37, 0x00011C37}, {0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, +{0x00011CA8, 0x00011CA8}, {0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, +{0x00011D37, 0x00011D39}, {0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, +{0x00011D5A, 0x00011D5F}, {0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, +{0x00011D92, 0x00011D92}, {0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011EFF}, +{0x00011F11, 0x00011F11}, {0x00011F3B, 0x00011F3D}, {0x00011F5A, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, +{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, +{0x00012544, 0x00012F8F}, {0x00012FF3, 0x00012FFF}, {0x00013430, 0x0001343F}, {0x00013456, 0x000143FF}, +{0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F}, {0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, +{0x00016ABF, 0x00016ABF}, {0x00016ACA, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, {0x00016AF6, 0x00016AFF}, +{0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, {0x00016B78, 0x00016B7C}, +{0x00016B90, 0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, {0x00016F88, 0x00016F8E}, +{0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, {0x000187F8, 0x000187FF}, +{0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFEF}, {0x0001AFF4, 0x0001AFF4}, {0x0001AFFC, 0x0001AFFC}, +{0x0001AFFF, 0x0001AFFF}, {0x0001B123, 0x0001B131}, {0x0001B133, 0x0001B14F}, {0x0001B153, 0x0001B154}, +{0x0001B156, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, +{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CEFF}, +{0x0001CF2E, 0x0001CF2F}, {0x0001CF47, 0x0001CF4F}, {0x0001CFC4, 0x0001CFFF}, {0x0001D0F6, 0x0001D0FF}, +{0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1EB, 0x0001D1FF}, {0x0001D246, 0x0001D2BF}, +{0x0001D2D4, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, +{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, +{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, +{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, +{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, +{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, +{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DEFF}, {0x0001DF1F, 0x0001DF24}, +{0x0001DF2B, 0x0001DFFF}, {0x0001E007, 0x0001E007}, {0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, +{0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E02F}, {0x0001E06E, 0x0001E08E}, {0x0001E090, 0x0001E0FF}, +{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E28F}, +{0x0001E2AF, 0x0001E2BF}, {0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E4CF}, {0x0001E4FA, 0x0001E7DF}, +{0x0001E7E7, 0x0001E7E7}, {0x0001E7EC, 0x0001E7EC}, {0x0001E7EF, 0x0001E7EF}, {0x0001E7FF, 0x0001E7FF}, +{0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF}, {0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, +{0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00}, {0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, +{0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23}, {0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, +{0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38}, {0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, +{0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48}, {0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, +{0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53}, {0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, +{0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C}, {0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, +{0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66}, {0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, +{0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D}, {0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, +{0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4}, {0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, +{0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F}, {0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, +{0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0}, {0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, +{0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F}, {0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, +{0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DB}, {0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, +{0x0001F777, 0x0001F77A}, {0x0001F7DA, 0x0001F7DF}, {0x0001F7EC, 0x0001F7EF}, {0x0001F7F1, 0x0001F7FF}, +{0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, {0x0001F888, 0x0001F88F}, +{0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, +{0x0001FA7D, 0x0001FA7F}, {0x0001FA89, 0x0001FA8F}, {0x0001FABE, 0x0001FABE}, {0x0001FAC6, 0x0001FACD}, +{0x0001FADC, 0x0001FADF}, {0x0001FAE9, 0x0001FAEF}, {0x0001FAF9, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, +{0x0001FBCB, 0x0001FBEF}, {0x0001FBFA, 0x0001FFFF}, {0x0002A6E0, 0x0002A6FF}, {0x0002B73A, 0x0002B73F}, +{0x0002B81E, 0x0002B81F}, {0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002EBEF}, {0x0002EE5E, 0x0002F7FF}, +{0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x0003134F}, {0x000323B0, 0x000E00FF}, {0x000E01F0, 0x0010FFFF}, +}; + +const std::map unicode_map_lowercase = { +{0x00000041, 0x00000061}, {0x00000042, 0x00000062}, {0x00000043, 0x00000063}, {0x00000044, 0x00000064}, +{0x00000045, 0x00000065}, {0x00000046, 0x00000066}, {0x00000047, 0x00000067}, {0x00000048, 0x00000068}, +{0x00000049, 0x00000069}, {0x0000004A, 0x0000006A}, {0x0000004B, 0x0000006B}, {0x0000004C, 0x0000006C}, +{0x0000004D, 0x0000006D}, {0x0000004E, 0x0000006E}, {0x0000004F, 0x0000006F}, {0x00000050, 0x00000070}, +{0x00000051, 0x00000071}, {0x00000052, 0x00000072}, {0x00000053, 0x00000073}, {0x00000054, 0x00000074}, +{0x00000055, 0x00000075}, {0x00000056, 0x00000076}, {0x00000057, 0x00000077}, {0x00000058, 0x00000078}, +{0x00000059, 0x00000079}, {0x0000005A, 0x0000007A}, {0x000000C0, 0x000000E0}, {0x000000C1, 0x000000E1}, +{0x000000C2, 0x000000E2}, {0x000000C3, 0x000000E3}, {0x000000C4, 0x000000E4}, {0x000000C5, 0x000000E5}, +{0x000000C6, 0x000000E6}, {0x000000C7, 0x000000E7}, {0x000000C8, 0x000000E8}, {0x000000C9, 0x000000E9}, +{0x000000CA, 0x000000EA}, {0x000000CB, 0x000000EB}, {0x000000CC, 0x000000EC}, {0x000000CD, 0x000000ED}, +{0x000000CE, 0x000000EE}, {0x000000CF, 0x000000EF}, {0x000000D0, 0x000000F0}, {0x000000D1, 0x000000F1}, +{0x000000D2, 0x000000F2}, {0x000000D3, 0x000000F3}, {0x000000D4, 0x000000F4}, {0x000000D5, 0x000000F5}, +{0x000000D6, 0x000000F6}, {0x000000D8, 0x000000F8}, {0x000000D9, 0x000000F9}, {0x000000DA, 0x000000FA}, +{0x000000DB, 0x000000FB}, {0x000000DC, 0x000000FC}, {0x000000DD, 0x000000FD}, {0x000000DE, 0x000000FE}, +{0x00000100, 0x00000101}, {0x00000102, 0x00000103}, {0x00000104, 0x00000105}, {0x00000106, 0x00000107}, +{0x00000108, 0x00000109}, {0x0000010A, 0x0000010B}, {0x0000010C, 0x0000010D}, {0x0000010E, 0x0000010F}, +{0x00000110, 0x00000111}, {0x00000112, 0x00000113}, {0x00000114, 0x00000115}, {0x00000116, 0x00000117}, +{0x00000118, 0x00000119}, {0x0000011A, 0x0000011B}, {0x0000011C, 0x0000011D}, {0x0000011E, 0x0000011F}, +{0x00000120, 0x00000121}, {0x00000122, 0x00000123}, {0x00000124, 0x00000125}, {0x00000126, 0x00000127}, +{0x00000128, 0x00000129}, {0x0000012A, 0x0000012B}, {0x0000012C, 0x0000012D}, {0x0000012E, 0x0000012F}, +{0x00000130, 0x00000069}, {0x00000132, 0x00000133}, {0x00000134, 0x00000135}, {0x00000136, 0x00000137}, +{0x00000139, 0x0000013A}, {0x0000013B, 0x0000013C}, {0x0000013D, 0x0000013E}, {0x0000013F, 0x00000140}, +{0x00000141, 0x00000142}, {0x00000143, 0x00000144}, {0x00000145, 0x00000146}, {0x00000147, 0x00000148}, +{0x0000014A, 0x0000014B}, {0x0000014C, 0x0000014D}, {0x0000014E, 0x0000014F}, {0x00000150, 0x00000151}, +{0x00000152, 0x00000153}, {0x00000154, 0x00000155}, {0x00000156, 0x00000157}, {0x00000158, 0x00000159}, +{0x0000015A, 0x0000015B}, {0x0000015C, 0x0000015D}, {0x0000015E, 0x0000015F}, {0x00000160, 0x00000161}, +{0x00000162, 0x00000163}, {0x00000164, 0x00000165}, {0x00000166, 0x00000167}, {0x00000168, 0x00000169}, +{0x0000016A, 0x0000016B}, {0x0000016C, 0x0000016D}, {0x0000016E, 0x0000016F}, {0x00000170, 0x00000171}, +{0x00000172, 0x00000173}, {0x00000174, 0x00000175}, {0x00000176, 0x00000177}, {0x00000178, 0x000000FF}, +{0x00000179, 0x0000017A}, {0x0000017B, 0x0000017C}, {0x0000017D, 0x0000017E}, {0x00000181, 0x00000253}, +{0x00000182, 0x00000183}, {0x00000184, 0x00000185}, {0x00000186, 0x00000254}, {0x00000187, 0x00000188}, +{0x00000189, 0x00000256}, {0x0000018A, 0x00000257}, {0x0000018B, 0x0000018C}, {0x0000018E, 0x000001DD}, +{0x0000018F, 0x00000259}, {0x00000190, 0x0000025B}, {0x00000191, 0x00000192}, {0x00000193, 0x00000260}, +{0x00000194, 0x00000263}, {0x00000196, 0x00000269}, {0x00000197, 0x00000268}, {0x00000198, 0x00000199}, +{0x0000019C, 0x0000026F}, {0x0000019D, 0x00000272}, {0x0000019F, 0x00000275}, {0x000001A0, 0x000001A1}, +{0x000001A2, 0x000001A3}, {0x000001A4, 0x000001A5}, {0x000001A6, 0x00000280}, {0x000001A7, 0x000001A8}, +{0x000001A9, 0x00000283}, {0x000001AC, 0x000001AD}, {0x000001AE, 0x00000288}, {0x000001AF, 0x000001B0}, +{0x000001B1, 0x0000028A}, {0x000001B2, 0x0000028B}, {0x000001B3, 0x000001B4}, {0x000001B5, 0x000001B6}, +{0x000001B7, 0x00000292}, {0x000001B8, 0x000001B9}, {0x000001BC, 0x000001BD}, {0x000001C4, 0x000001C6}, +{0x000001C5, 0x000001C6}, {0x000001C7, 0x000001C9}, {0x000001C8, 0x000001C9}, {0x000001CA, 0x000001CC}, +{0x000001CB, 0x000001CC}, {0x000001CD, 0x000001CE}, {0x000001CF, 0x000001D0}, {0x000001D1, 0x000001D2}, +{0x000001D3, 0x000001D4}, {0x000001D5, 0x000001D6}, {0x000001D7, 0x000001D8}, {0x000001D9, 0x000001DA}, +{0x000001DB, 0x000001DC}, {0x000001DE, 0x000001DF}, {0x000001E0, 0x000001E1}, {0x000001E2, 0x000001E3}, +{0x000001E4, 0x000001E5}, {0x000001E6, 0x000001E7}, {0x000001E8, 0x000001E9}, {0x000001EA, 0x000001EB}, +{0x000001EC, 0x000001ED}, {0x000001EE, 0x000001EF}, {0x000001F1, 0x000001F3}, {0x000001F2, 0x000001F3}, +{0x000001F4, 0x000001F5}, {0x000001F6, 0x00000195}, {0x000001F7, 0x000001BF}, {0x000001F8, 0x000001F9}, +{0x000001FA, 0x000001FB}, {0x000001FC, 0x000001FD}, {0x000001FE, 0x000001FF}, {0x00000200, 0x00000201}, +{0x00000202, 0x00000203}, {0x00000204, 0x00000205}, {0x00000206, 0x00000207}, {0x00000208, 0x00000209}, +{0x0000020A, 0x0000020B}, {0x0000020C, 0x0000020D}, {0x0000020E, 0x0000020F}, {0x00000210, 0x00000211}, +{0x00000212, 0x00000213}, {0x00000214, 0x00000215}, {0x00000216, 0x00000217}, {0x00000218, 0x00000219}, +{0x0000021A, 0x0000021B}, {0x0000021C, 0x0000021D}, {0x0000021E, 0x0000021F}, {0x00000220, 0x0000019E}, +{0x00000222, 0x00000223}, {0x00000224, 0x00000225}, {0x00000226, 0x00000227}, {0x00000228, 0x00000229}, +{0x0000022A, 0x0000022B}, {0x0000022C, 0x0000022D}, {0x0000022E, 0x0000022F}, {0x00000230, 0x00000231}, +{0x00000232, 0x00000233}, {0x0000023A, 0x00002C65}, {0x0000023B, 0x0000023C}, {0x0000023D, 0x0000019A}, +{0x0000023E, 0x00002C66}, {0x00000241, 0x00000242}, {0x00000243, 0x00000180}, {0x00000244, 0x00000289}, +{0x00000245, 0x0000028C}, {0x00000246, 0x00000247}, {0x00000248, 0x00000249}, {0x0000024A, 0x0000024B}, +{0x0000024C, 0x0000024D}, {0x0000024E, 0x0000024F}, {0x00000370, 0x00000371}, {0x00000372, 0x00000373}, +{0x00000376, 0x00000377}, {0x0000037F, 0x000003F3}, {0x00000386, 0x000003AC}, {0x00000388, 0x000003AD}, +{0x00000389, 0x000003AE}, {0x0000038A, 0x000003AF}, {0x0000038C, 0x000003CC}, {0x0000038E, 0x000003CD}, +{0x0000038F, 0x000003CE}, {0x00000391, 0x000003B1}, {0x00000392, 0x000003B2}, {0x00000393, 0x000003B3}, +{0x00000394, 0x000003B4}, {0x00000395, 0x000003B5}, {0x00000396, 0x000003B6}, {0x00000397, 0x000003B7}, +{0x00000398, 0x000003B8}, {0x00000399, 0x000003B9}, {0x0000039A, 0x000003BA}, {0x0000039B, 0x000003BB}, +{0x0000039C, 0x000003BC}, {0x0000039D, 0x000003BD}, {0x0000039E, 0x000003BE}, {0x0000039F, 0x000003BF}, +{0x000003A0, 0x000003C0}, {0x000003A1, 0x000003C1}, {0x000003A3, 0x000003C3}, {0x000003A4, 0x000003C4}, +{0x000003A5, 0x000003C5}, {0x000003A6, 0x000003C6}, {0x000003A7, 0x000003C7}, {0x000003A8, 0x000003C8}, +{0x000003A9, 0x000003C9}, {0x000003AA, 0x000003CA}, {0x000003AB, 0x000003CB}, {0x000003CF, 0x000003D7}, +{0x000003D8, 0x000003D9}, {0x000003DA, 0x000003DB}, {0x000003DC, 0x000003DD}, {0x000003DE, 0x000003DF}, +{0x000003E0, 0x000003E1}, {0x000003E2, 0x000003E3}, {0x000003E4, 0x000003E5}, {0x000003E6, 0x000003E7}, +{0x000003E8, 0x000003E9}, {0x000003EA, 0x000003EB}, {0x000003EC, 0x000003ED}, {0x000003EE, 0x000003EF}, +{0x000003F4, 0x000003B8}, {0x000003F7, 0x000003F8}, {0x000003F9, 0x000003F2}, {0x000003FA, 0x000003FB}, +{0x000003FD, 0x0000037B}, {0x000003FE, 0x0000037C}, {0x000003FF, 0x0000037D}, {0x00000400, 0x00000450}, +{0x00000401, 0x00000451}, {0x00000402, 0x00000452}, {0x00000403, 0x00000453}, {0x00000404, 0x00000454}, +{0x00000405, 0x00000455}, {0x00000406, 0x00000456}, {0x00000407, 0x00000457}, {0x00000408, 0x00000458}, +{0x00000409, 0x00000459}, {0x0000040A, 0x0000045A}, {0x0000040B, 0x0000045B}, {0x0000040C, 0x0000045C}, +{0x0000040D, 0x0000045D}, {0x0000040E, 0x0000045E}, {0x0000040F, 0x0000045F}, {0x00000410, 0x00000430}, +{0x00000411, 0x00000431}, {0x00000412, 0x00000432}, {0x00000413, 0x00000433}, {0x00000414, 0x00000434}, +{0x00000415, 0x00000435}, {0x00000416, 0x00000436}, {0x00000417, 0x00000437}, {0x00000418, 0x00000438}, +{0x00000419, 0x00000439}, {0x0000041A, 0x0000043A}, {0x0000041B, 0x0000043B}, {0x0000041C, 0x0000043C}, +{0x0000041D, 0x0000043D}, {0x0000041E, 0x0000043E}, {0x0000041F, 0x0000043F}, {0x00000420, 0x00000440}, +{0x00000421, 0x00000441}, {0x00000422, 0x00000442}, {0x00000423, 0x00000443}, {0x00000424, 0x00000444}, +{0x00000425, 0x00000445}, {0x00000426, 0x00000446}, {0x00000427, 0x00000447}, {0x00000428, 0x00000448}, +{0x00000429, 0x00000449}, {0x0000042A, 0x0000044A}, {0x0000042B, 0x0000044B}, {0x0000042C, 0x0000044C}, +{0x0000042D, 0x0000044D}, {0x0000042E, 0x0000044E}, {0x0000042F, 0x0000044F}, {0x00000460, 0x00000461}, +{0x00000462, 0x00000463}, {0x00000464, 0x00000465}, {0x00000466, 0x00000467}, {0x00000468, 0x00000469}, +{0x0000046A, 0x0000046B}, {0x0000046C, 0x0000046D}, {0x0000046E, 0x0000046F}, {0x00000470, 0x00000471}, +{0x00000472, 0x00000473}, {0x00000474, 0x00000475}, {0x00000476, 0x00000477}, {0x00000478, 0x00000479}, +{0x0000047A, 0x0000047B}, {0x0000047C, 0x0000047D}, {0x0000047E, 0x0000047F}, {0x00000480, 0x00000481}, +{0x0000048A, 0x0000048B}, {0x0000048C, 0x0000048D}, {0x0000048E, 0x0000048F}, {0x00000490, 0x00000491}, +{0x00000492, 0x00000493}, {0x00000494, 0x00000495}, {0x00000496, 0x00000497}, {0x00000498, 0x00000499}, +{0x0000049A, 0x0000049B}, {0x0000049C, 0x0000049D}, {0x0000049E, 0x0000049F}, {0x000004A0, 0x000004A1}, +{0x000004A2, 0x000004A3}, {0x000004A4, 0x000004A5}, {0x000004A6, 0x000004A7}, {0x000004A8, 0x000004A9}, +{0x000004AA, 0x000004AB}, {0x000004AC, 0x000004AD}, {0x000004AE, 0x000004AF}, {0x000004B0, 0x000004B1}, +{0x000004B2, 0x000004B3}, {0x000004B4, 0x000004B5}, {0x000004B6, 0x000004B7}, {0x000004B8, 0x000004B9}, +{0x000004BA, 0x000004BB}, {0x000004BC, 0x000004BD}, {0x000004BE, 0x000004BF}, {0x000004C0, 0x000004CF}, +{0x000004C1, 0x000004C2}, {0x000004C3, 0x000004C4}, {0x000004C5, 0x000004C6}, {0x000004C7, 0x000004C8}, +{0x000004C9, 0x000004CA}, {0x000004CB, 0x000004CC}, {0x000004CD, 0x000004CE}, {0x000004D0, 0x000004D1}, +{0x000004D2, 0x000004D3}, {0x000004D4, 0x000004D5}, {0x000004D6, 0x000004D7}, {0x000004D8, 0x000004D9}, +{0x000004DA, 0x000004DB}, {0x000004DC, 0x000004DD}, {0x000004DE, 0x000004DF}, {0x000004E0, 0x000004E1}, +{0x000004E2, 0x000004E3}, {0x000004E4, 0x000004E5}, {0x000004E6, 0x000004E7}, {0x000004E8, 0x000004E9}, +{0x000004EA, 0x000004EB}, {0x000004EC, 0x000004ED}, {0x000004EE, 0x000004EF}, {0x000004F0, 0x000004F1}, +{0x000004F2, 0x000004F3}, {0x000004F4, 0x000004F5}, {0x000004F6, 0x000004F7}, {0x000004F8, 0x000004F9}, +{0x000004FA, 0x000004FB}, {0x000004FC, 0x000004FD}, {0x000004FE, 0x000004FF}, {0x00000500, 0x00000501}, +{0x00000502, 0x00000503}, {0x00000504, 0x00000505}, {0x00000506, 0x00000507}, {0x00000508, 0x00000509}, +{0x0000050A, 0x0000050B}, {0x0000050C, 0x0000050D}, {0x0000050E, 0x0000050F}, {0x00000510, 0x00000511}, +{0x00000512, 0x00000513}, {0x00000514, 0x00000515}, {0x00000516, 0x00000517}, {0x00000518, 0x00000519}, +{0x0000051A, 0x0000051B}, {0x0000051C, 0x0000051D}, {0x0000051E, 0x0000051F}, {0x00000520, 0x00000521}, +{0x00000522, 0x00000523}, {0x00000524, 0x00000525}, {0x00000526, 0x00000527}, {0x00000528, 0x00000529}, +{0x0000052A, 0x0000052B}, {0x0000052C, 0x0000052D}, {0x0000052E, 0x0000052F}, {0x00000531, 0x00000561}, +{0x00000532, 0x00000562}, {0x00000533, 0x00000563}, {0x00000534, 0x00000564}, {0x00000535, 0x00000565}, +{0x00000536, 0x00000566}, {0x00000537, 0x00000567}, {0x00000538, 0x00000568}, {0x00000539, 0x00000569}, +{0x0000053A, 0x0000056A}, {0x0000053B, 0x0000056B}, {0x0000053C, 0x0000056C}, {0x0000053D, 0x0000056D}, +{0x0000053E, 0x0000056E}, {0x0000053F, 0x0000056F}, {0x00000540, 0x00000570}, {0x00000541, 0x00000571}, +{0x00000542, 0x00000572}, {0x00000543, 0x00000573}, {0x00000544, 0x00000574}, {0x00000545, 0x00000575}, +{0x00000546, 0x00000576}, {0x00000547, 0x00000577}, {0x00000548, 0x00000578}, {0x00000549, 0x00000579}, +{0x0000054A, 0x0000057A}, {0x0000054B, 0x0000057B}, {0x0000054C, 0x0000057C}, {0x0000054D, 0x0000057D}, +{0x0000054E, 0x0000057E}, {0x0000054F, 0x0000057F}, {0x00000550, 0x00000580}, {0x00000551, 0x00000581}, +{0x00000552, 0x00000582}, {0x00000553, 0x00000583}, {0x00000554, 0x00000584}, {0x00000555, 0x00000585}, +{0x00000556, 0x00000586}, {0x000010A0, 0x00002D00}, {0x000010A1, 0x00002D01}, {0x000010A2, 0x00002D02}, +{0x000010A3, 0x00002D03}, {0x000010A4, 0x00002D04}, {0x000010A5, 0x00002D05}, {0x000010A6, 0x00002D06}, +{0x000010A7, 0x00002D07}, {0x000010A8, 0x00002D08}, {0x000010A9, 0x00002D09}, {0x000010AA, 0x00002D0A}, +{0x000010AB, 0x00002D0B}, {0x000010AC, 0x00002D0C}, {0x000010AD, 0x00002D0D}, {0x000010AE, 0x00002D0E}, +{0x000010AF, 0x00002D0F}, {0x000010B0, 0x00002D10}, {0x000010B1, 0x00002D11}, {0x000010B2, 0x00002D12}, +{0x000010B3, 0x00002D13}, {0x000010B4, 0x00002D14}, {0x000010B5, 0x00002D15}, {0x000010B6, 0x00002D16}, +{0x000010B7, 0x00002D17}, {0x000010B8, 0x00002D18}, {0x000010B9, 0x00002D19}, {0x000010BA, 0x00002D1A}, +{0x000010BB, 0x00002D1B}, {0x000010BC, 0x00002D1C}, {0x000010BD, 0x00002D1D}, {0x000010BE, 0x00002D1E}, +{0x000010BF, 0x00002D1F}, {0x000010C0, 0x00002D20}, {0x000010C1, 0x00002D21}, {0x000010C2, 0x00002D22}, +{0x000010C3, 0x00002D23}, {0x000010C4, 0x00002D24}, {0x000010C5, 0x00002D25}, {0x000010C7, 0x00002D27}, +{0x000010CD, 0x00002D2D}, {0x000013A0, 0x0000AB70}, {0x000013A1, 0x0000AB71}, {0x000013A2, 0x0000AB72}, +{0x000013A3, 0x0000AB73}, {0x000013A4, 0x0000AB74}, {0x000013A5, 0x0000AB75}, {0x000013A6, 0x0000AB76}, +{0x000013A7, 0x0000AB77}, {0x000013A8, 0x0000AB78}, {0x000013A9, 0x0000AB79}, {0x000013AA, 0x0000AB7A}, +{0x000013AB, 0x0000AB7B}, {0x000013AC, 0x0000AB7C}, {0x000013AD, 0x0000AB7D}, {0x000013AE, 0x0000AB7E}, +{0x000013AF, 0x0000AB7F}, {0x000013B0, 0x0000AB80}, {0x000013B1, 0x0000AB81}, {0x000013B2, 0x0000AB82}, +{0x000013B3, 0x0000AB83}, {0x000013B4, 0x0000AB84}, {0x000013B5, 0x0000AB85}, {0x000013B6, 0x0000AB86}, +{0x000013B7, 0x0000AB87}, {0x000013B8, 0x0000AB88}, {0x000013B9, 0x0000AB89}, {0x000013BA, 0x0000AB8A}, +{0x000013BB, 0x0000AB8B}, {0x000013BC, 0x0000AB8C}, {0x000013BD, 0x0000AB8D}, {0x000013BE, 0x0000AB8E}, +{0x000013BF, 0x0000AB8F}, {0x000013C0, 0x0000AB90}, {0x000013C1, 0x0000AB91}, {0x000013C2, 0x0000AB92}, +{0x000013C3, 0x0000AB93}, {0x000013C4, 0x0000AB94}, {0x000013C5, 0x0000AB95}, {0x000013C6, 0x0000AB96}, +{0x000013C7, 0x0000AB97}, {0x000013C8, 0x0000AB98}, {0x000013C9, 0x0000AB99}, {0x000013CA, 0x0000AB9A}, +{0x000013CB, 0x0000AB9B}, {0x000013CC, 0x0000AB9C}, {0x000013CD, 0x0000AB9D}, {0x000013CE, 0x0000AB9E}, +{0x000013CF, 0x0000AB9F}, {0x000013D0, 0x0000ABA0}, {0x000013D1, 0x0000ABA1}, {0x000013D2, 0x0000ABA2}, +{0x000013D3, 0x0000ABA3}, {0x000013D4, 0x0000ABA4}, {0x000013D5, 0x0000ABA5}, {0x000013D6, 0x0000ABA6}, +{0x000013D7, 0x0000ABA7}, {0x000013D8, 0x0000ABA8}, {0x000013D9, 0x0000ABA9}, {0x000013DA, 0x0000ABAA}, +{0x000013DB, 0x0000ABAB}, {0x000013DC, 0x0000ABAC}, {0x000013DD, 0x0000ABAD}, {0x000013DE, 0x0000ABAE}, +{0x000013DF, 0x0000ABAF}, {0x000013E0, 0x0000ABB0}, {0x000013E1, 0x0000ABB1}, {0x000013E2, 0x0000ABB2}, +{0x000013E3, 0x0000ABB3}, {0x000013E4, 0x0000ABB4}, {0x000013E5, 0x0000ABB5}, {0x000013E6, 0x0000ABB6}, +{0x000013E7, 0x0000ABB7}, {0x000013E8, 0x0000ABB8}, {0x000013E9, 0x0000ABB9}, {0x000013EA, 0x0000ABBA}, +{0x000013EB, 0x0000ABBB}, {0x000013EC, 0x0000ABBC}, {0x000013ED, 0x0000ABBD}, {0x000013EE, 0x0000ABBE}, +{0x000013EF, 0x0000ABBF}, {0x000013F0, 0x000013F8}, {0x000013F1, 0x000013F9}, {0x000013F2, 0x000013FA}, +{0x000013F3, 0x000013FB}, {0x000013F4, 0x000013FC}, {0x000013F5, 0x000013FD}, {0x00001C90, 0x000010D0}, +{0x00001C91, 0x000010D1}, {0x00001C92, 0x000010D2}, {0x00001C93, 0x000010D3}, {0x00001C94, 0x000010D4}, +{0x00001C95, 0x000010D5}, {0x00001C96, 0x000010D6}, {0x00001C97, 0x000010D7}, {0x00001C98, 0x000010D8}, +{0x00001C99, 0x000010D9}, {0x00001C9A, 0x000010DA}, {0x00001C9B, 0x000010DB}, {0x00001C9C, 0x000010DC}, +{0x00001C9D, 0x000010DD}, {0x00001C9E, 0x000010DE}, {0x00001C9F, 0x000010DF}, {0x00001CA0, 0x000010E0}, +{0x00001CA1, 0x000010E1}, {0x00001CA2, 0x000010E2}, {0x00001CA3, 0x000010E3}, {0x00001CA4, 0x000010E4}, +{0x00001CA5, 0x000010E5}, {0x00001CA6, 0x000010E6}, {0x00001CA7, 0x000010E7}, {0x00001CA8, 0x000010E8}, +{0x00001CA9, 0x000010E9}, {0x00001CAA, 0x000010EA}, {0x00001CAB, 0x000010EB}, {0x00001CAC, 0x000010EC}, +{0x00001CAD, 0x000010ED}, {0x00001CAE, 0x000010EE}, {0x00001CAF, 0x000010EF}, {0x00001CB0, 0x000010F0}, +{0x00001CB1, 0x000010F1}, {0x00001CB2, 0x000010F2}, {0x00001CB3, 0x000010F3}, {0x00001CB4, 0x000010F4}, +{0x00001CB5, 0x000010F5}, {0x00001CB6, 0x000010F6}, {0x00001CB7, 0x000010F7}, {0x00001CB8, 0x000010F8}, +{0x00001CB9, 0x000010F9}, {0x00001CBA, 0x000010FA}, {0x00001CBD, 0x000010FD}, {0x00001CBE, 0x000010FE}, +{0x00001CBF, 0x000010FF}, {0x00001E00, 0x00001E01}, {0x00001E02, 0x00001E03}, {0x00001E04, 0x00001E05}, +{0x00001E06, 0x00001E07}, {0x00001E08, 0x00001E09}, {0x00001E0A, 0x00001E0B}, {0x00001E0C, 0x00001E0D}, +{0x00001E0E, 0x00001E0F}, {0x00001E10, 0x00001E11}, {0x00001E12, 0x00001E13}, {0x00001E14, 0x00001E15}, +{0x00001E16, 0x00001E17}, {0x00001E18, 0x00001E19}, {0x00001E1A, 0x00001E1B}, {0x00001E1C, 0x00001E1D}, +{0x00001E1E, 0x00001E1F}, {0x00001E20, 0x00001E21}, {0x00001E22, 0x00001E23}, {0x00001E24, 0x00001E25}, +{0x00001E26, 0x00001E27}, {0x00001E28, 0x00001E29}, {0x00001E2A, 0x00001E2B}, {0x00001E2C, 0x00001E2D}, +{0x00001E2E, 0x00001E2F}, {0x00001E30, 0x00001E31}, {0x00001E32, 0x00001E33}, {0x00001E34, 0x00001E35}, +{0x00001E36, 0x00001E37}, {0x00001E38, 0x00001E39}, {0x00001E3A, 0x00001E3B}, {0x00001E3C, 0x00001E3D}, +{0x00001E3E, 0x00001E3F}, {0x00001E40, 0x00001E41}, {0x00001E42, 0x00001E43}, {0x00001E44, 0x00001E45}, +{0x00001E46, 0x00001E47}, {0x00001E48, 0x00001E49}, {0x00001E4A, 0x00001E4B}, {0x00001E4C, 0x00001E4D}, +{0x00001E4E, 0x00001E4F}, {0x00001E50, 0x00001E51}, {0x00001E52, 0x00001E53}, {0x00001E54, 0x00001E55}, +{0x00001E56, 0x00001E57}, {0x00001E58, 0x00001E59}, {0x00001E5A, 0x00001E5B}, {0x00001E5C, 0x00001E5D}, +{0x00001E5E, 0x00001E5F}, {0x00001E60, 0x00001E61}, {0x00001E62, 0x00001E63}, {0x00001E64, 0x00001E65}, +{0x00001E66, 0x00001E67}, {0x00001E68, 0x00001E69}, {0x00001E6A, 0x00001E6B}, {0x00001E6C, 0x00001E6D}, +{0x00001E6E, 0x00001E6F}, {0x00001E70, 0x00001E71}, {0x00001E72, 0x00001E73}, {0x00001E74, 0x00001E75}, +{0x00001E76, 0x00001E77}, {0x00001E78, 0x00001E79}, {0x00001E7A, 0x00001E7B}, {0x00001E7C, 0x00001E7D}, +{0x00001E7E, 0x00001E7F}, {0x00001E80, 0x00001E81}, {0x00001E82, 0x00001E83}, {0x00001E84, 0x00001E85}, +{0x00001E86, 0x00001E87}, {0x00001E88, 0x00001E89}, {0x00001E8A, 0x00001E8B}, {0x00001E8C, 0x00001E8D}, +{0x00001E8E, 0x00001E8F}, {0x00001E90, 0x00001E91}, {0x00001E92, 0x00001E93}, {0x00001E94, 0x00001E95}, +{0x00001E9E, 0x000000DF}, {0x00001EA0, 0x00001EA1}, {0x00001EA2, 0x00001EA3}, {0x00001EA4, 0x00001EA5}, +{0x00001EA6, 0x00001EA7}, {0x00001EA8, 0x00001EA9}, {0x00001EAA, 0x00001EAB}, {0x00001EAC, 0x00001EAD}, +{0x00001EAE, 0x00001EAF}, {0x00001EB0, 0x00001EB1}, {0x00001EB2, 0x00001EB3}, {0x00001EB4, 0x00001EB5}, +{0x00001EB6, 0x00001EB7}, {0x00001EB8, 0x00001EB9}, {0x00001EBA, 0x00001EBB}, {0x00001EBC, 0x00001EBD}, +{0x00001EBE, 0x00001EBF}, {0x00001EC0, 0x00001EC1}, {0x00001EC2, 0x00001EC3}, {0x00001EC4, 0x00001EC5}, +{0x00001EC6, 0x00001EC7}, {0x00001EC8, 0x00001EC9}, {0x00001ECA, 0x00001ECB}, {0x00001ECC, 0x00001ECD}, +{0x00001ECE, 0x00001ECF}, {0x00001ED0, 0x00001ED1}, {0x00001ED2, 0x00001ED3}, {0x00001ED4, 0x00001ED5}, +{0x00001ED6, 0x00001ED7}, {0x00001ED8, 0x00001ED9}, {0x00001EDA, 0x00001EDB}, {0x00001EDC, 0x00001EDD}, +{0x00001EDE, 0x00001EDF}, {0x00001EE0, 0x00001EE1}, {0x00001EE2, 0x00001EE3}, {0x00001EE4, 0x00001EE5}, +{0x00001EE6, 0x00001EE7}, {0x00001EE8, 0x00001EE9}, {0x00001EEA, 0x00001EEB}, {0x00001EEC, 0x00001EED}, +{0x00001EEE, 0x00001EEF}, {0x00001EF0, 0x00001EF1}, {0x00001EF2, 0x00001EF3}, {0x00001EF4, 0x00001EF5}, +{0x00001EF6, 0x00001EF7}, {0x00001EF8, 0x00001EF9}, {0x00001EFA, 0x00001EFB}, {0x00001EFC, 0x00001EFD}, +{0x00001EFE, 0x00001EFF}, {0x00001F08, 0x00001F00}, {0x00001F09, 0x00001F01}, {0x00001F0A, 0x00001F02}, +{0x00001F0B, 0x00001F03}, {0x00001F0C, 0x00001F04}, {0x00001F0D, 0x00001F05}, {0x00001F0E, 0x00001F06}, +{0x00001F0F, 0x00001F07}, {0x00001F18, 0x00001F10}, {0x00001F19, 0x00001F11}, {0x00001F1A, 0x00001F12}, +{0x00001F1B, 0x00001F13}, {0x00001F1C, 0x00001F14}, {0x00001F1D, 0x00001F15}, {0x00001F28, 0x00001F20}, +{0x00001F29, 0x00001F21}, {0x00001F2A, 0x00001F22}, {0x00001F2B, 0x00001F23}, {0x00001F2C, 0x00001F24}, +{0x00001F2D, 0x00001F25}, {0x00001F2E, 0x00001F26}, {0x00001F2F, 0x00001F27}, {0x00001F38, 0x00001F30}, +{0x00001F39, 0x00001F31}, {0x00001F3A, 0x00001F32}, {0x00001F3B, 0x00001F33}, {0x00001F3C, 0x00001F34}, +{0x00001F3D, 0x00001F35}, {0x00001F3E, 0x00001F36}, {0x00001F3F, 0x00001F37}, {0x00001F48, 0x00001F40}, +{0x00001F49, 0x00001F41}, {0x00001F4A, 0x00001F42}, {0x00001F4B, 0x00001F43}, {0x00001F4C, 0x00001F44}, +{0x00001F4D, 0x00001F45}, {0x00001F59, 0x00001F51}, {0x00001F5B, 0x00001F53}, {0x00001F5D, 0x00001F55}, +{0x00001F5F, 0x00001F57}, {0x00001F68, 0x00001F60}, {0x00001F69, 0x00001F61}, {0x00001F6A, 0x00001F62}, +{0x00001F6B, 0x00001F63}, {0x00001F6C, 0x00001F64}, {0x00001F6D, 0x00001F65}, {0x00001F6E, 0x00001F66}, +{0x00001F6F, 0x00001F67}, {0x00001F88, 0x00001F80}, {0x00001F89, 0x00001F81}, {0x00001F8A, 0x00001F82}, +{0x00001F8B, 0x00001F83}, {0x00001F8C, 0x00001F84}, {0x00001F8D, 0x00001F85}, {0x00001F8E, 0x00001F86}, +{0x00001F8F, 0x00001F87}, {0x00001F98, 0x00001F90}, {0x00001F99, 0x00001F91}, {0x00001F9A, 0x00001F92}, +{0x00001F9B, 0x00001F93}, {0x00001F9C, 0x00001F94}, {0x00001F9D, 0x00001F95}, {0x00001F9E, 0x00001F96}, +{0x00001F9F, 0x00001F97}, {0x00001FA8, 0x00001FA0}, {0x00001FA9, 0x00001FA1}, {0x00001FAA, 0x00001FA2}, +{0x00001FAB, 0x00001FA3}, {0x00001FAC, 0x00001FA4}, {0x00001FAD, 0x00001FA5}, {0x00001FAE, 0x00001FA6}, +{0x00001FAF, 0x00001FA7}, {0x00001FB8, 0x00001FB0}, {0x00001FB9, 0x00001FB1}, {0x00001FBA, 0x00001F70}, +{0x00001FBB, 0x00001F71}, {0x00001FBC, 0x00001FB3}, {0x00001FC8, 0x00001F72}, {0x00001FC9, 0x00001F73}, +{0x00001FCA, 0x00001F74}, {0x00001FCB, 0x00001F75}, {0x00001FCC, 0x00001FC3}, {0x00001FD8, 0x00001FD0}, +{0x00001FD9, 0x00001FD1}, {0x00001FDA, 0x00001F76}, {0x00001FDB, 0x00001F77}, {0x00001FE8, 0x00001FE0}, +{0x00001FE9, 0x00001FE1}, {0x00001FEA, 0x00001F7A}, {0x00001FEB, 0x00001F7B}, {0x00001FEC, 0x00001FE5}, +{0x00001FF8, 0x00001F78}, {0x00001FF9, 0x00001F79}, {0x00001FFA, 0x00001F7C}, {0x00001FFB, 0x00001F7D}, +{0x00001FFC, 0x00001FF3}, {0x00002126, 0x000003C9}, {0x0000212A, 0x0000006B}, {0x0000212B, 0x000000E5}, +{0x00002132, 0x0000214E}, {0x00002160, 0x00002170}, {0x00002161, 0x00002171}, {0x00002162, 0x00002172}, +{0x00002163, 0x00002173}, {0x00002164, 0x00002174}, {0x00002165, 0x00002175}, {0x00002166, 0x00002176}, +{0x00002167, 0x00002177}, {0x00002168, 0x00002178}, {0x00002169, 0x00002179}, {0x0000216A, 0x0000217A}, +{0x0000216B, 0x0000217B}, {0x0000216C, 0x0000217C}, {0x0000216D, 0x0000217D}, {0x0000216E, 0x0000217E}, +{0x0000216F, 0x0000217F}, {0x00002183, 0x00002184}, {0x000024B6, 0x000024D0}, {0x000024B7, 0x000024D1}, +{0x000024B8, 0x000024D2}, {0x000024B9, 0x000024D3}, {0x000024BA, 0x000024D4}, {0x000024BB, 0x000024D5}, +{0x000024BC, 0x000024D6}, {0x000024BD, 0x000024D7}, {0x000024BE, 0x000024D8}, {0x000024BF, 0x000024D9}, +{0x000024C0, 0x000024DA}, {0x000024C1, 0x000024DB}, {0x000024C2, 0x000024DC}, {0x000024C3, 0x000024DD}, +{0x000024C4, 0x000024DE}, {0x000024C5, 0x000024DF}, {0x000024C6, 0x000024E0}, {0x000024C7, 0x000024E1}, +{0x000024C8, 0x000024E2}, {0x000024C9, 0x000024E3}, {0x000024CA, 0x000024E4}, {0x000024CB, 0x000024E5}, +{0x000024CC, 0x000024E6}, {0x000024CD, 0x000024E7}, {0x000024CE, 0x000024E8}, {0x000024CF, 0x000024E9}, +{0x00002C00, 0x00002C30}, {0x00002C01, 0x00002C31}, {0x00002C02, 0x00002C32}, {0x00002C03, 0x00002C33}, +{0x00002C04, 0x00002C34}, {0x00002C05, 0x00002C35}, {0x00002C06, 0x00002C36}, {0x00002C07, 0x00002C37}, +{0x00002C08, 0x00002C38}, {0x00002C09, 0x00002C39}, {0x00002C0A, 0x00002C3A}, {0x00002C0B, 0x00002C3B}, +{0x00002C0C, 0x00002C3C}, {0x00002C0D, 0x00002C3D}, {0x00002C0E, 0x00002C3E}, {0x00002C0F, 0x00002C3F}, +{0x00002C10, 0x00002C40}, {0x00002C11, 0x00002C41}, {0x00002C12, 0x00002C42}, {0x00002C13, 0x00002C43}, +{0x00002C14, 0x00002C44}, {0x00002C15, 0x00002C45}, {0x00002C16, 0x00002C46}, {0x00002C17, 0x00002C47}, +{0x00002C18, 0x00002C48}, {0x00002C19, 0x00002C49}, {0x00002C1A, 0x00002C4A}, {0x00002C1B, 0x00002C4B}, +{0x00002C1C, 0x00002C4C}, {0x00002C1D, 0x00002C4D}, {0x00002C1E, 0x00002C4E}, {0x00002C1F, 0x00002C4F}, +{0x00002C20, 0x00002C50}, {0x00002C21, 0x00002C51}, {0x00002C22, 0x00002C52}, {0x00002C23, 0x00002C53}, +{0x00002C24, 0x00002C54}, {0x00002C25, 0x00002C55}, {0x00002C26, 0x00002C56}, {0x00002C27, 0x00002C57}, +{0x00002C28, 0x00002C58}, {0x00002C29, 0x00002C59}, {0x00002C2A, 0x00002C5A}, {0x00002C2B, 0x00002C5B}, +{0x00002C2C, 0x00002C5C}, {0x00002C2D, 0x00002C5D}, {0x00002C2E, 0x00002C5E}, {0x00002C60, 0x00002C61}, +{0x00002C62, 0x0000026B}, {0x00002C63, 0x00001D7D}, {0x00002C64, 0x0000027D}, {0x00002C67, 0x00002C68}, +{0x00002C69, 0x00002C6A}, {0x00002C6B, 0x00002C6C}, {0x00002C6D, 0x00000251}, {0x00002C6E, 0x00000271}, +{0x00002C6F, 0x00000250}, {0x00002C70, 0x00000252}, {0x00002C72, 0x00002C73}, {0x00002C75, 0x00002C76}, +{0x00002C7E, 0x0000023F}, {0x00002C7F, 0x00000240}, {0x00002C80, 0x00002C81}, {0x00002C82, 0x00002C83}, +{0x00002C84, 0x00002C85}, {0x00002C86, 0x00002C87}, {0x00002C88, 0x00002C89}, {0x00002C8A, 0x00002C8B}, +{0x00002C8C, 0x00002C8D}, {0x00002C8E, 0x00002C8F}, {0x00002C90, 0x00002C91}, {0x00002C92, 0x00002C93}, +{0x00002C94, 0x00002C95}, {0x00002C96, 0x00002C97}, {0x00002C98, 0x00002C99}, {0x00002C9A, 0x00002C9B}, +{0x00002C9C, 0x00002C9D}, {0x00002C9E, 0x00002C9F}, {0x00002CA0, 0x00002CA1}, {0x00002CA2, 0x00002CA3}, +{0x00002CA4, 0x00002CA5}, {0x00002CA6, 0x00002CA7}, {0x00002CA8, 0x00002CA9}, {0x00002CAA, 0x00002CAB}, +{0x00002CAC, 0x00002CAD}, {0x00002CAE, 0x00002CAF}, {0x00002CB0, 0x00002CB1}, {0x00002CB2, 0x00002CB3}, +{0x00002CB4, 0x00002CB5}, {0x00002CB6, 0x00002CB7}, {0x00002CB8, 0x00002CB9}, {0x00002CBA, 0x00002CBB}, +{0x00002CBC, 0x00002CBD}, {0x00002CBE, 0x00002CBF}, {0x00002CC0, 0x00002CC1}, {0x00002CC2, 0x00002CC3}, +{0x00002CC4, 0x00002CC5}, {0x00002CC6, 0x00002CC7}, {0x00002CC8, 0x00002CC9}, {0x00002CCA, 0x00002CCB}, +{0x00002CCC, 0x00002CCD}, {0x00002CCE, 0x00002CCF}, {0x00002CD0, 0x00002CD1}, {0x00002CD2, 0x00002CD3}, +{0x00002CD4, 0x00002CD5}, {0x00002CD6, 0x00002CD7}, {0x00002CD8, 0x00002CD9}, {0x00002CDA, 0x00002CDB}, +{0x00002CDC, 0x00002CDD}, {0x00002CDE, 0x00002CDF}, {0x00002CE0, 0x00002CE1}, {0x00002CE2, 0x00002CE3}, +{0x00002CEB, 0x00002CEC}, {0x00002CED, 0x00002CEE}, {0x00002CF2, 0x00002CF3}, {0x0000A640, 0x0000A641}, +{0x0000A642, 0x0000A643}, {0x0000A644, 0x0000A645}, {0x0000A646, 0x0000A647}, {0x0000A648, 0x0000A649}, +{0x0000A64A, 0x0000A64B}, {0x0000A64C, 0x0000A64D}, {0x0000A64E, 0x0000A64F}, {0x0000A650, 0x0000A651}, +{0x0000A652, 0x0000A653}, {0x0000A654, 0x0000A655}, {0x0000A656, 0x0000A657}, {0x0000A658, 0x0000A659}, +{0x0000A65A, 0x0000A65B}, {0x0000A65C, 0x0000A65D}, {0x0000A65E, 0x0000A65F}, {0x0000A660, 0x0000A661}, +{0x0000A662, 0x0000A663}, {0x0000A664, 0x0000A665}, {0x0000A666, 0x0000A667}, {0x0000A668, 0x0000A669}, +{0x0000A66A, 0x0000A66B}, {0x0000A66C, 0x0000A66D}, {0x0000A680, 0x0000A681}, {0x0000A682, 0x0000A683}, +{0x0000A684, 0x0000A685}, {0x0000A686, 0x0000A687}, {0x0000A688, 0x0000A689}, {0x0000A68A, 0x0000A68B}, +{0x0000A68C, 0x0000A68D}, {0x0000A68E, 0x0000A68F}, {0x0000A690, 0x0000A691}, {0x0000A692, 0x0000A693}, +{0x0000A694, 0x0000A695}, {0x0000A696, 0x0000A697}, {0x0000A698, 0x0000A699}, {0x0000A69A, 0x0000A69B}, +{0x0000A722, 0x0000A723}, {0x0000A724, 0x0000A725}, {0x0000A726, 0x0000A727}, {0x0000A728, 0x0000A729}, +{0x0000A72A, 0x0000A72B}, {0x0000A72C, 0x0000A72D}, {0x0000A72E, 0x0000A72F}, {0x0000A732, 0x0000A733}, +{0x0000A734, 0x0000A735}, {0x0000A736, 0x0000A737}, {0x0000A738, 0x0000A739}, {0x0000A73A, 0x0000A73B}, +{0x0000A73C, 0x0000A73D}, {0x0000A73E, 0x0000A73F}, {0x0000A740, 0x0000A741}, {0x0000A742, 0x0000A743}, +{0x0000A744, 0x0000A745}, {0x0000A746, 0x0000A747}, {0x0000A748, 0x0000A749}, {0x0000A74A, 0x0000A74B}, +{0x0000A74C, 0x0000A74D}, {0x0000A74E, 0x0000A74F}, {0x0000A750, 0x0000A751}, {0x0000A752, 0x0000A753}, +{0x0000A754, 0x0000A755}, {0x0000A756, 0x0000A757}, {0x0000A758, 0x0000A759}, {0x0000A75A, 0x0000A75B}, +{0x0000A75C, 0x0000A75D}, {0x0000A75E, 0x0000A75F}, {0x0000A760, 0x0000A761}, {0x0000A762, 0x0000A763}, +{0x0000A764, 0x0000A765}, {0x0000A766, 0x0000A767}, {0x0000A768, 0x0000A769}, {0x0000A76A, 0x0000A76B}, +{0x0000A76C, 0x0000A76D}, {0x0000A76E, 0x0000A76F}, {0x0000A779, 0x0000A77A}, {0x0000A77B, 0x0000A77C}, +{0x0000A77D, 0x00001D79}, {0x0000A77E, 0x0000A77F}, {0x0000A780, 0x0000A781}, {0x0000A782, 0x0000A783}, +{0x0000A784, 0x0000A785}, {0x0000A786, 0x0000A787}, {0x0000A78B, 0x0000A78C}, {0x0000A78D, 0x00000265}, +{0x0000A790, 0x0000A791}, {0x0000A792, 0x0000A793}, {0x0000A796, 0x0000A797}, {0x0000A798, 0x0000A799}, +{0x0000A79A, 0x0000A79B}, {0x0000A79C, 0x0000A79D}, {0x0000A79E, 0x0000A79F}, {0x0000A7A0, 0x0000A7A1}, +{0x0000A7A2, 0x0000A7A3}, {0x0000A7A4, 0x0000A7A5}, {0x0000A7A6, 0x0000A7A7}, {0x0000A7A8, 0x0000A7A9}, +{0x0000A7AA, 0x00000266}, {0x0000A7AB, 0x0000025C}, {0x0000A7AC, 0x00000261}, {0x0000A7AD, 0x0000026C}, +{0x0000A7AE, 0x0000026A}, {0x0000A7B0, 0x0000029E}, {0x0000A7B1, 0x00000287}, {0x0000A7B2, 0x0000029D}, +{0x0000A7B3, 0x0000AB53}, {0x0000A7B4, 0x0000A7B5}, {0x0000A7B6, 0x0000A7B7}, {0x0000A7B8, 0x0000A7B9}, +{0x0000A7BA, 0x0000A7BB}, {0x0000A7BC, 0x0000A7BD}, {0x0000A7BE, 0x0000A7BF}, {0x0000A7C2, 0x0000A7C3}, +{0x0000A7C4, 0x0000A794}, {0x0000A7C5, 0x00000282}, {0x0000A7C6, 0x00001D8E}, {0x0000A7C7, 0x0000A7C8}, +{0x0000A7C9, 0x0000A7CA}, {0x0000A7F5, 0x0000A7F6}, {0x0000FF21, 0x0000FF41}, {0x0000FF22, 0x0000FF42}, +{0x0000FF23, 0x0000FF43}, {0x0000FF24, 0x0000FF44}, {0x0000FF25, 0x0000FF45}, {0x0000FF26, 0x0000FF46}, +{0x0000FF27, 0x0000FF47}, {0x0000FF28, 0x0000FF48}, {0x0000FF29, 0x0000FF49}, {0x0000FF2A, 0x0000FF4A}, +{0x0000FF2B, 0x0000FF4B}, {0x0000FF2C, 0x0000FF4C}, {0x0000FF2D, 0x0000FF4D}, {0x0000FF2E, 0x0000FF4E}, +{0x0000FF2F, 0x0000FF4F}, {0x0000FF30, 0x0000FF50}, {0x0000FF31, 0x0000FF51}, {0x0000FF32, 0x0000FF52}, +{0x0000FF33, 0x0000FF53}, {0x0000FF34, 0x0000FF54}, {0x0000FF35, 0x0000FF55}, {0x0000FF36, 0x0000FF56}, +{0x0000FF37, 0x0000FF57}, {0x0000FF38, 0x0000FF58}, {0x0000FF39, 0x0000FF59}, {0x0000FF3A, 0x0000FF5A}, +{0x00010400, 0x00010428}, {0x00010401, 0x00010429}, {0x00010402, 0x0001042A}, {0x00010403, 0x0001042B}, +{0x00010404, 0x0001042C}, {0x00010405, 0x0001042D}, {0x00010406, 0x0001042E}, {0x00010407, 0x0001042F}, +{0x00010408, 0x00010430}, {0x00010409, 0x00010431}, {0x0001040A, 0x00010432}, {0x0001040B, 0x00010433}, +{0x0001040C, 0x00010434}, {0x0001040D, 0x00010435}, {0x0001040E, 0x00010436}, {0x0001040F, 0x00010437}, +{0x00010410, 0x00010438}, {0x00010411, 0x00010439}, {0x00010412, 0x0001043A}, {0x00010413, 0x0001043B}, +{0x00010414, 0x0001043C}, {0x00010415, 0x0001043D}, {0x00010416, 0x0001043E}, {0x00010417, 0x0001043F}, +{0x00010418, 0x00010440}, {0x00010419, 0x00010441}, {0x0001041A, 0x00010442}, {0x0001041B, 0x00010443}, +{0x0001041C, 0x00010444}, {0x0001041D, 0x00010445}, {0x0001041E, 0x00010446}, {0x0001041F, 0x00010447}, +{0x00010420, 0x00010448}, {0x00010421, 0x00010449}, {0x00010422, 0x0001044A}, {0x00010423, 0x0001044B}, +{0x00010424, 0x0001044C}, {0x00010425, 0x0001044D}, {0x00010426, 0x0001044E}, {0x00010427, 0x0001044F}, +{0x000104B0, 0x000104D8}, {0x000104B1, 0x000104D9}, {0x000104B2, 0x000104DA}, {0x000104B3, 0x000104DB}, +{0x000104B4, 0x000104DC}, {0x000104B5, 0x000104DD}, {0x000104B6, 0x000104DE}, {0x000104B7, 0x000104DF}, +{0x000104B8, 0x000104E0}, {0x000104B9, 0x000104E1}, {0x000104BA, 0x000104E2}, {0x000104BB, 0x000104E3}, +{0x000104BC, 0x000104E4}, {0x000104BD, 0x000104E5}, {0x000104BE, 0x000104E6}, {0x000104BF, 0x000104E7}, +{0x000104C0, 0x000104E8}, {0x000104C1, 0x000104E9}, {0x000104C2, 0x000104EA}, {0x000104C3, 0x000104EB}, +{0x000104C4, 0x000104EC}, {0x000104C5, 0x000104ED}, {0x000104C6, 0x000104EE}, {0x000104C7, 0x000104EF}, +{0x000104C8, 0x000104F0}, {0x000104C9, 0x000104F1}, {0x000104CA, 0x000104F2}, {0x000104CB, 0x000104F3}, +{0x000104CC, 0x000104F4}, {0x000104CD, 0x000104F5}, {0x000104CE, 0x000104F6}, {0x000104CF, 0x000104F7}, +{0x000104D0, 0x000104F8}, {0x000104D1, 0x000104F9}, {0x000104D2, 0x000104FA}, {0x000104D3, 0x000104FB}, +{0x00010C80, 0x00010CC0}, {0x00010C81, 0x00010CC1}, {0x00010C82, 0x00010CC2}, {0x00010C83, 0x00010CC3}, +{0x00010C84, 0x00010CC4}, {0x00010C85, 0x00010CC5}, {0x00010C86, 0x00010CC6}, {0x00010C87, 0x00010CC7}, +{0x00010C88, 0x00010CC8}, {0x00010C89, 0x00010CC9}, {0x00010C8A, 0x00010CCA}, {0x00010C8B, 0x00010CCB}, +{0x00010C8C, 0x00010CCC}, {0x00010C8D, 0x00010CCD}, {0x00010C8E, 0x00010CCE}, {0x00010C8F, 0x00010CCF}, +{0x00010C90, 0x00010CD0}, {0x00010C91, 0x00010CD1}, {0x00010C92, 0x00010CD2}, {0x00010C93, 0x00010CD3}, +{0x00010C94, 0x00010CD4}, {0x00010C95, 0x00010CD5}, {0x00010C96, 0x00010CD6}, {0x00010C97, 0x00010CD7}, +{0x00010C98, 0x00010CD8}, {0x00010C99, 0x00010CD9}, {0x00010C9A, 0x00010CDA}, {0x00010C9B, 0x00010CDB}, +{0x00010C9C, 0x00010CDC}, {0x00010C9D, 0x00010CDD}, {0x00010C9E, 0x00010CDE}, {0x00010C9F, 0x00010CDF}, +{0x00010CA0, 0x00010CE0}, {0x00010CA1, 0x00010CE1}, {0x00010CA2, 0x00010CE2}, {0x00010CA3, 0x00010CE3}, +{0x00010CA4, 0x00010CE4}, {0x00010CA5, 0x00010CE5}, {0x00010CA6, 0x00010CE6}, {0x00010CA7, 0x00010CE7}, +{0x00010CA8, 0x00010CE8}, {0x00010CA9, 0x00010CE9}, {0x00010CAA, 0x00010CEA}, {0x00010CAB, 0x00010CEB}, +{0x00010CAC, 0x00010CEC}, {0x00010CAD, 0x00010CED}, {0x00010CAE, 0x00010CEE}, {0x00010CAF, 0x00010CEF}, +{0x00010CB0, 0x00010CF0}, {0x00010CB1, 0x00010CF1}, {0x00010CB2, 0x00010CF2}, {0x000118A0, 0x000118C0}, +{0x000118A1, 0x000118C1}, {0x000118A2, 0x000118C2}, {0x000118A3, 0x000118C3}, {0x000118A4, 0x000118C4}, +{0x000118A5, 0x000118C5}, {0x000118A6, 0x000118C6}, {0x000118A7, 0x000118C7}, {0x000118A8, 0x000118C8}, +{0x000118A9, 0x000118C9}, {0x000118AA, 0x000118CA}, {0x000118AB, 0x000118CB}, {0x000118AC, 0x000118CC}, +{0x000118AD, 0x000118CD}, {0x000118AE, 0x000118CE}, {0x000118AF, 0x000118CF}, {0x000118B0, 0x000118D0}, +{0x000118B1, 0x000118D1}, {0x000118B2, 0x000118D2}, {0x000118B3, 0x000118D3}, {0x000118B4, 0x000118D4}, +{0x000118B5, 0x000118D5}, {0x000118B6, 0x000118D6}, {0x000118B7, 0x000118D7}, {0x000118B8, 0x000118D8}, +{0x000118B9, 0x000118D9}, {0x000118BA, 0x000118DA}, {0x000118BB, 0x000118DB}, {0x000118BC, 0x000118DC}, +{0x000118BD, 0x000118DD}, {0x000118BE, 0x000118DE}, {0x000118BF, 0x000118DF}, {0x00016E40, 0x00016E60}, +{0x00016E41, 0x00016E61}, {0x00016E42, 0x00016E62}, {0x00016E43, 0x00016E63}, {0x00016E44, 0x00016E64}, +{0x00016E45, 0x00016E65}, {0x00016E46, 0x00016E66}, {0x00016E47, 0x00016E67}, {0x00016E48, 0x00016E68}, +{0x00016E49, 0x00016E69}, {0x00016E4A, 0x00016E6A}, {0x00016E4B, 0x00016E6B}, {0x00016E4C, 0x00016E6C}, +{0x00016E4D, 0x00016E6D}, {0x00016E4E, 0x00016E6E}, {0x00016E4F, 0x00016E6F}, {0x00016E50, 0x00016E70}, +{0x00016E51, 0x00016E71}, {0x00016E52, 0x00016E72}, {0x00016E53, 0x00016E73}, {0x00016E54, 0x00016E74}, +{0x00016E55, 0x00016E75}, {0x00016E56, 0x00016E76}, {0x00016E57, 0x00016E77}, {0x00016E58, 0x00016E78}, +{0x00016E59, 0x00016E79}, {0x00016E5A, 0x00016E7A}, {0x00016E5B, 0x00016E7B}, {0x00016E5C, 0x00016E7C}, +{0x00016E5D, 0x00016E7D}, {0x00016E5E, 0x00016E7E}, {0x00016E5F, 0x00016E7F}, {0x0001E900, 0x0001E922}, +{0x0001E901, 0x0001E923}, {0x0001E902, 0x0001E924}, {0x0001E903, 0x0001E925}, {0x0001E904, 0x0001E926}, +{0x0001E905, 0x0001E927}, {0x0001E906, 0x0001E928}, {0x0001E907, 0x0001E929}, {0x0001E908, 0x0001E92A}, +{0x0001E909, 0x0001E92B}, {0x0001E90A, 0x0001E92C}, {0x0001E90B, 0x0001E92D}, {0x0001E90C, 0x0001E92E}, +{0x0001E90D, 0x0001E92F}, {0x0001E90E, 0x0001E930}, {0x0001E90F, 0x0001E931}, {0x0001E910, 0x0001E932}, +{0x0001E911, 0x0001E933}, {0x0001E912, 0x0001E934}, {0x0001E913, 0x0001E935}, {0x0001E914, 0x0001E936}, +{0x0001E915, 0x0001E937}, {0x0001E916, 0x0001E938}, {0x0001E917, 0x0001E939}, {0x0001E918, 0x0001E93A}, +{0x0001E919, 0x0001E93B}, {0x0001E91A, 0x0001E93C}, {0x0001E91B, 0x0001E93D}, {0x0001E91C, 0x0001E93E}, +{0x0001E91D, 0x0001E93F}, {0x0001E91E, 0x0001E940}, {0x0001E91F, 0x0001E941}, {0x0001E920, 0x0001E942}, +{0x0001E921, 0x0001E943}, +}; + +const std::map unicode_map_uppercase = { +{0x00000061, 0x00000041}, {0x00000062, 0x00000042}, {0x00000063, 0x00000043}, {0x00000064, 0x00000044}, +{0x00000065, 0x00000045}, {0x00000066, 0x00000046}, {0x00000067, 0x00000047}, {0x00000068, 0x00000048}, +{0x00000069, 0x00000049}, {0x0000006A, 0x0000004A}, {0x0000006B, 0x0000004B}, {0x0000006C, 0x0000004C}, +{0x0000006D, 0x0000004D}, {0x0000006E, 0x0000004E}, {0x0000006F, 0x0000004F}, {0x00000070, 0x00000050}, +{0x00000071, 0x00000051}, {0x00000072, 0x00000052}, {0x00000073, 0x00000053}, {0x00000074, 0x00000054}, +{0x00000075, 0x00000055}, {0x00000076, 0x00000056}, {0x00000077, 0x00000057}, {0x00000078, 0x00000058}, +{0x00000079, 0x00000059}, {0x0000007A, 0x0000005A}, {0x000000B5, 0x0000039C}, {0x000000DF, 0x00000053}, +{0x000000E0, 0x000000C0}, {0x000000E1, 0x000000C1}, {0x000000E2, 0x000000C2}, {0x000000E3, 0x000000C3}, +{0x000000E4, 0x000000C4}, {0x000000E5, 0x000000C5}, {0x000000E6, 0x000000C6}, {0x000000E7, 0x000000C7}, +{0x000000E8, 0x000000C8}, {0x000000E9, 0x000000C9}, {0x000000EA, 0x000000CA}, {0x000000EB, 0x000000CB}, +{0x000000EC, 0x000000CC}, {0x000000ED, 0x000000CD}, {0x000000EE, 0x000000CE}, {0x000000EF, 0x000000CF}, +{0x000000F0, 0x000000D0}, {0x000000F1, 0x000000D1}, {0x000000F2, 0x000000D2}, {0x000000F3, 0x000000D3}, +{0x000000F4, 0x000000D4}, {0x000000F5, 0x000000D5}, {0x000000F6, 0x000000D6}, {0x000000F8, 0x000000D8}, +{0x000000F9, 0x000000D9}, {0x000000FA, 0x000000DA}, {0x000000FB, 0x000000DB}, {0x000000FC, 0x000000DC}, +{0x000000FD, 0x000000DD}, {0x000000FE, 0x000000DE}, {0x000000FF, 0x00000178}, {0x00000101, 0x00000100}, +{0x00000103, 0x00000102}, {0x00000105, 0x00000104}, {0x00000107, 0x00000106}, {0x00000109, 0x00000108}, +{0x0000010B, 0x0000010A}, {0x0000010D, 0x0000010C}, {0x0000010F, 0x0000010E}, {0x00000111, 0x00000110}, +{0x00000113, 0x00000112}, {0x00000115, 0x00000114}, {0x00000117, 0x00000116}, {0x00000119, 0x00000118}, +{0x0000011B, 0x0000011A}, {0x0000011D, 0x0000011C}, {0x0000011F, 0x0000011E}, {0x00000121, 0x00000120}, +{0x00000123, 0x00000122}, {0x00000125, 0x00000124}, {0x00000127, 0x00000126}, {0x00000129, 0x00000128}, +{0x0000012B, 0x0000012A}, {0x0000012D, 0x0000012C}, {0x0000012F, 0x0000012E}, {0x00000131, 0x00000049}, +{0x00000133, 0x00000132}, {0x00000135, 0x00000134}, {0x00000137, 0x00000136}, {0x0000013A, 0x00000139}, +{0x0000013C, 0x0000013B}, {0x0000013E, 0x0000013D}, {0x00000140, 0x0000013F}, {0x00000142, 0x00000141}, +{0x00000144, 0x00000143}, {0x00000146, 0x00000145}, {0x00000148, 0x00000147}, {0x00000149, 0x000002BC}, +{0x0000014B, 0x0000014A}, {0x0000014D, 0x0000014C}, {0x0000014F, 0x0000014E}, {0x00000151, 0x00000150}, +{0x00000153, 0x00000152}, {0x00000155, 0x00000154}, {0x00000157, 0x00000156}, {0x00000159, 0x00000158}, +{0x0000015B, 0x0000015A}, {0x0000015D, 0x0000015C}, {0x0000015F, 0x0000015E}, {0x00000161, 0x00000160}, +{0x00000163, 0x00000162}, {0x00000165, 0x00000164}, {0x00000167, 0x00000166}, {0x00000169, 0x00000168}, +{0x0000016B, 0x0000016A}, {0x0000016D, 0x0000016C}, {0x0000016F, 0x0000016E}, {0x00000171, 0x00000170}, +{0x00000173, 0x00000172}, {0x00000175, 0x00000174}, {0x00000177, 0x00000176}, {0x0000017A, 0x00000179}, +{0x0000017C, 0x0000017B}, {0x0000017E, 0x0000017D}, {0x0000017F, 0x00000053}, {0x00000180, 0x00000243}, +{0x00000183, 0x00000182}, {0x00000185, 0x00000184}, {0x00000188, 0x00000187}, {0x0000018C, 0x0000018B}, +{0x00000192, 0x00000191}, {0x00000195, 0x000001F6}, {0x00000199, 0x00000198}, {0x0000019A, 0x0000023D}, +{0x0000019E, 0x00000220}, {0x000001A1, 0x000001A0}, {0x000001A3, 0x000001A2}, {0x000001A5, 0x000001A4}, +{0x000001A8, 0x000001A7}, {0x000001AD, 0x000001AC}, {0x000001B0, 0x000001AF}, {0x000001B4, 0x000001B3}, +{0x000001B6, 0x000001B5}, {0x000001B9, 0x000001B8}, {0x000001BD, 0x000001BC}, {0x000001BF, 0x000001F7}, +{0x000001C5, 0x000001C4}, {0x000001C6, 0x000001C4}, {0x000001C8, 0x000001C7}, {0x000001C9, 0x000001C7}, +{0x000001CB, 0x000001CA}, {0x000001CC, 0x000001CA}, {0x000001CE, 0x000001CD}, {0x000001D0, 0x000001CF}, +{0x000001D2, 0x000001D1}, {0x000001D4, 0x000001D3}, {0x000001D6, 0x000001D5}, {0x000001D8, 0x000001D7}, +{0x000001DA, 0x000001D9}, {0x000001DC, 0x000001DB}, {0x000001DD, 0x0000018E}, {0x000001DF, 0x000001DE}, +{0x000001E1, 0x000001E0}, {0x000001E3, 0x000001E2}, {0x000001E5, 0x000001E4}, {0x000001E7, 0x000001E6}, +{0x000001E9, 0x000001E8}, {0x000001EB, 0x000001EA}, {0x000001ED, 0x000001EC}, {0x000001EF, 0x000001EE}, +{0x000001F0, 0x0000004A}, {0x000001F2, 0x000001F1}, {0x000001F3, 0x000001F1}, {0x000001F5, 0x000001F4}, +{0x000001F9, 0x000001F8}, {0x000001FB, 0x000001FA}, {0x000001FD, 0x000001FC}, {0x000001FF, 0x000001FE}, +{0x00000201, 0x00000200}, {0x00000203, 0x00000202}, {0x00000205, 0x00000204}, {0x00000207, 0x00000206}, +{0x00000209, 0x00000208}, {0x0000020B, 0x0000020A}, {0x0000020D, 0x0000020C}, {0x0000020F, 0x0000020E}, +{0x00000211, 0x00000210}, {0x00000213, 0x00000212}, {0x00000215, 0x00000214}, {0x00000217, 0x00000216}, +{0x00000219, 0x00000218}, {0x0000021B, 0x0000021A}, {0x0000021D, 0x0000021C}, {0x0000021F, 0x0000021E}, +{0x00000223, 0x00000222}, {0x00000225, 0x00000224}, {0x00000227, 0x00000226}, {0x00000229, 0x00000228}, +{0x0000022B, 0x0000022A}, {0x0000022D, 0x0000022C}, {0x0000022F, 0x0000022E}, {0x00000231, 0x00000230}, +{0x00000233, 0x00000232}, {0x0000023C, 0x0000023B}, {0x0000023F, 0x00002C7E}, {0x00000240, 0x00002C7F}, +{0x00000242, 0x00000241}, {0x00000247, 0x00000246}, {0x00000249, 0x00000248}, {0x0000024B, 0x0000024A}, +{0x0000024D, 0x0000024C}, {0x0000024F, 0x0000024E}, {0x00000250, 0x00002C6F}, {0x00000251, 0x00002C6D}, +{0x00000252, 0x00002C70}, {0x00000253, 0x00000181}, {0x00000254, 0x00000186}, {0x00000256, 0x00000189}, +{0x00000257, 0x0000018A}, {0x00000259, 0x0000018F}, {0x0000025B, 0x00000190}, {0x0000025C, 0x0000A7AB}, +{0x00000260, 0x00000193}, {0x00000261, 0x0000A7AC}, {0x00000263, 0x00000194}, {0x00000265, 0x0000A78D}, +{0x00000266, 0x0000A7AA}, {0x00000268, 0x00000197}, {0x00000269, 0x00000196}, {0x0000026A, 0x0000A7AE}, +{0x0000026B, 0x00002C62}, {0x0000026C, 0x0000A7AD}, {0x0000026F, 0x0000019C}, {0x00000271, 0x00002C6E}, +{0x00000272, 0x0000019D}, {0x00000275, 0x0000019F}, {0x0000027D, 0x00002C64}, {0x00000280, 0x000001A6}, +{0x00000282, 0x0000A7C5}, {0x00000283, 0x000001A9}, {0x00000287, 0x0000A7B1}, {0x00000288, 0x000001AE}, +{0x00000289, 0x00000244}, {0x0000028A, 0x000001B1}, {0x0000028B, 0x000001B2}, {0x0000028C, 0x00000245}, +{0x00000292, 0x000001B7}, {0x0000029D, 0x0000A7B2}, {0x0000029E, 0x0000A7B0}, {0x00000345, 0x00000399}, +{0x00000371, 0x00000370}, {0x00000373, 0x00000372}, {0x00000377, 0x00000376}, {0x0000037B, 0x000003FD}, +{0x0000037C, 0x000003FE}, {0x0000037D, 0x000003FF}, {0x00000390, 0x00000399}, {0x000003AC, 0x00000386}, +{0x000003AD, 0x00000388}, {0x000003AE, 0x00000389}, {0x000003AF, 0x0000038A}, {0x000003B0, 0x000003A5}, +{0x000003B1, 0x00000391}, {0x000003B2, 0x00000392}, {0x000003B3, 0x00000393}, {0x000003B4, 0x00000394}, +{0x000003B5, 0x00000395}, {0x000003B6, 0x00000396}, {0x000003B7, 0x00000397}, {0x000003B8, 0x00000398}, +{0x000003B9, 0x00000399}, {0x000003BA, 0x0000039A}, {0x000003BB, 0x0000039B}, {0x000003BC, 0x0000039C}, +{0x000003BD, 0x0000039D}, {0x000003BE, 0x0000039E}, {0x000003BF, 0x0000039F}, {0x000003C0, 0x000003A0}, +{0x000003C1, 0x000003A1}, {0x000003C2, 0x000003A3}, {0x000003C3, 0x000003A3}, {0x000003C4, 0x000003A4}, +{0x000003C5, 0x000003A5}, {0x000003C6, 0x000003A6}, {0x000003C7, 0x000003A7}, {0x000003C8, 0x000003A8}, +{0x000003C9, 0x000003A9}, {0x000003CA, 0x000003AA}, {0x000003CB, 0x000003AB}, {0x000003CC, 0x0000038C}, +{0x000003CD, 0x0000038E}, {0x000003CE, 0x0000038F}, {0x000003D0, 0x00000392}, {0x000003D1, 0x00000398}, +{0x000003D5, 0x000003A6}, {0x000003D6, 0x000003A0}, {0x000003D7, 0x000003CF}, {0x000003D9, 0x000003D8}, +{0x000003DB, 0x000003DA}, {0x000003DD, 0x000003DC}, {0x000003DF, 0x000003DE}, {0x000003E1, 0x000003E0}, +{0x000003E3, 0x000003E2}, {0x000003E5, 0x000003E4}, {0x000003E7, 0x000003E6}, {0x000003E9, 0x000003E8}, +{0x000003EB, 0x000003EA}, {0x000003ED, 0x000003EC}, {0x000003EF, 0x000003EE}, {0x000003F0, 0x0000039A}, +{0x000003F1, 0x000003A1}, {0x000003F2, 0x000003F9}, {0x000003F3, 0x0000037F}, {0x000003F5, 0x00000395}, +{0x000003F8, 0x000003F7}, {0x000003FB, 0x000003FA}, {0x00000430, 0x00000410}, {0x00000431, 0x00000411}, +{0x00000432, 0x00000412}, {0x00000433, 0x00000413}, {0x00000434, 0x00000414}, {0x00000435, 0x00000415}, +{0x00000436, 0x00000416}, {0x00000437, 0x00000417}, {0x00000438, 0x00000418}, {0x00000439, 0x00000419}, +{0x0000043A, 0x0000041A}, {0x0000043B, 0x0000041B}, {0x0000043C, 0x0000041C}, {0x0000043D, 0x0000041D}, +{0x0000043E, 0x0000041E}, {0x0000043F, 0x0000041F}, {0x00000440, 0x00000420}, {0x00000441, 0x00000421}, +{0x00000442, 0x00000422}, {0x00000443, 0x00000423}, {0x00000444, 0x00000424}, {0x00000445, 0x00000425}, +{0x00000446, 0x00000426}, {0x00000447, 0x00000427}, {0x00000448, 0x00000428}, {0x00000449, 0x00000429}, +{0x0000044A, 0x0000042A}, {0x0000044B, 0x0000042B}, {0x0000044C, 0x0000042C}, {0x0000044D, 0x0000042D}, +{0x0000044E, 0x0000042E}, {0x0000044F, 0x0000042F}, {0x00000450, 0x00000400}, {0x00000451, 0x00000401}, +{0x00000452, 0x00000402}, {0x00000453, 0x00000403}, {0x00000454, 0x00000404}, {0x00000455, 0x00000405}, +{0x00000456, 0x00000406}, {0x00000457, 0x00000407}, {0x00000458, 0x00000408}, {0x00000459, 0x00000409}, +{0x0000045A, 0x0000040A}, {0x0000045B, 0x0000040B}, {0x0000045C, 0x0000040C}, {0x0000045D, 0x0000040D}, +{0x0000045E, 0x0000040E}, {0x0000045F, 0x0000040F}, {0x00000461, 0x00000460}, {0x00000463, 0x00000462}, +{0x00000465, 0x00000464}, {0x00000467, 0x00000466}, {0x00000469, 0x00000468}, {0x0000046B, 0x0000046A}, +{0x0000046D, 0x0000046C}, {0x0000046F, 0x0000046E}, {0x00000471, 0x00000470}, {0x00000473, 0x00000472}, +{0x00000475, 0x00000474}, {0x00000477, 0x00000476}, {0x00000479, 0x00000478}, {0x0000047B, 0x0000047A}, +{0x0000047D, 0x0000047C}, {0x0000047F, 0x0000047E}, {0x00000481, 0x00000480}, {0x0000048B, 0x0000048A}, +{0x0000048D, 0x0000048C}, {0x0000048F, 0x0000048E}, {0x00000491, 0x00000490}, {0x00000493, 0x00000492}, +{0x00000495, 0x00000494}, {0x00000497, 0x00000496}, {0x00000499, 0x00000498}, {0x0000049B, 0x0000049A}, +{0x0000049D, 0x0000049C}, {0x0000049F, 0x0000049E}, {0x000004A1, 0x000004A0}, {0x000004A3, 0x000004A2}, +{0x000004A5, 0x000004A4}, {0x000004A7, 0x000004A6}, {0x000004A9, 0x000004A8}, {0x000004AB, 0x000004AA}, +{0x000004AD, 0x000004AC}, {0x000004AF, 0x000004AE}, {0x000004B1, 0x000004B0}, {0x000004B3, 0x000004B2}, +{0x000004B5, 0x000004B4}, {0x000004B7, 0x000004B6}, {0x000004B9, 0x000004B8}, {0x000004BB, 0x000004BA}, +{0x000004BD, 0x000004BC}, {0x000004BF, 0x000004BE}, {0x000004C2, 0x000004C1}, {0x000004C4, 0x000004C3}, +{0x000004C6, 0x000004C5}, {0x000004C8, 0x000004C7}, {0x000004CA, 0x000004C9}, {0x000004CC, 0x000004CB}, +{0x000004CE, 0x000004CD}, {0x000004CF, 0x000004C0}, {0x000004D1, 0x000004D0}, {0x000004D3, 0x000004D2}, +{0x000004D5, 0x000004D4}, {0x000004D7, 0x000004D6}, {0x000004D9, 0x000004D8}, {0x000004DB, 0x000004DA}, +{0x000004DD, 0x000004DC}, {0x000004DF, 0x000004DE}, {0x000004E1, 0x000004E0}, {0x000004E3, 0x000004E2}, +{0x000004E5, 0x000004E4}, {0x000004E7, 0x000004E6}, {0x000004E9, 0x000004E8}, {0x000004EB, 0x000004EA}, +{0x000004ED, 0x000004EC}, {0x000004EF, 0x000004EE}, {0x000004F1, 0x000004F0}, {0x000004F3, 0x000004F2}, +{0x000004F5, 0x000004F4}, {0x000004F7, 0x000004F6}, {0x000004F9, 0x000004F8}, {0x000004FB, 0x000004FA}, +{0x000004FD, 0x000004FC}, {0x000004FF, 0x000004FE}, {0x00000501, 0x00000500}, {0x00000503, 0x00000502}, +{0x00000505, 0x00000504}, {0x00000507, 0x00000506}, {0x00000509, 0x00000508}, {0x0000050B, 0x0000050A}, +{0x0000050D, 0x0000050C}, {0x0000050F, 0x0000050E}, {0x00000511, 0x00000510}, {0x00000513, 0x00000512}, +{0x00000515, 0x00000514}, {0x00000517, 0x00000516}, {0x00000519, 0x00000518}, {0x0000051B, 0x0000051A}, +{0x0000051D, 0x0000051C}, {0x0000051F, 0x0000051E}, {0x00000521, 0x00000520}, {0x00000523, 0x00000522}, +{0x00000525, 0x00000524}, {0x00000527, 0x00000526}, {0x00000529, 0x00000528}, {0x0000052B, 0x0000052A}, +{0x0000052D, 0x0000052C}, {0x0000052F, 0x0000052E}, {0x00000561, 0x00000531}, {0x00000562, 0x00000532}, +{0x00000563, 0x00000533}, {0x00000564, 0x00000534}, {0x00000565, 0x00000535}, {0x00000566, 0x00000536}, +{0x00000567, 0x00000537}, {0x00000568, 0x00000538}, {0x00000569, 0x00000539}, {0x0000056A, 0x0000053A}, +{0x0000056B, 0x0000053B}, {0x0000056C, 0x0000053C}, {0x0000056D, 0x0000053D}, {0x0000056E, 0x0000053E}, +{0x0000056F, 0x0000053F}, {0x00000570, 0x00000540}, {0x00000571, 0x00000541}, {0x00000572, 0x00000542}, +{0x00000573, 0x00000543}, {0x00000574, 0x00000544}, {0x00000575, 0x00000545}, {0x00000576, 0x00000546}, +{0x00000577, 0x00000547}, {0x00000578, 0x00000548}, {0x00000579, 0x00000549}, {0x0000057A, 0x0000054A}, +{0x0000057B, 0x0000054B}, {0x0000057C, 0x0000054C}, {0x0000057D, 0x0000054D}, {0x0000057E, 0x0000054E}, +{0x0000057F, 0x0000054F}, {0x00000580, 0x00000550}, {0x00000581, 0x00000551}, {0x00000582, 0x00000552}, +{0x00000583, 0x00000553}, {0x00000584, 0x00000554}, {0x00000585, 0x00000555}, {0x00000586, 0x00000556}, +{0x00000587, 0x00000535}, {0x000010D0, 0x00001C90}, {0x000010D1, 0x00001C91}, {0x000010D2, 0x00001C92}, +{0x000010D3, 0x00001C93}, {0x000010D4, 0x00001C94}, {0x000010D5, 0x00001C95}, {0x000010D6, 0x00001C96}, +{0x000010D7, 0x00001C97}, {0x000010D8, 0x00001C98}, {0x000010D9, 0x00001C99}, {0x000010DA, 0x00001C9A}, +{0x000010DB, 0x00001C9B}, {0x000010DC, 0x00001C9C}, {0x000010DD, 0x00001C9D}, {0x000010DE, 0x00001C9E}, +{0x000010DF, 0x00001C9F}, {0x000010E0, 0x00001CA0}, {0x000010E1, 0x00001CA1}, {0x000010E2, 0x00001CA2}, +{0x000010E3, 0x00001CA3}, {0x000010E4, 0x00001CA4}, {0x000010E5, 0x00001CA5}, {0x000010E6, 0x00001CA6}, +{0x000010E7, 0x00001CA7}, {0x000010E8, 0x00001CA8}, {0x000010E9, 0x00001CA9}, {0x000010EA, 0x00001CAA}, +{0x000010EB, 0x00001CAB}, {0x000010EC, 0x00001CAC}, {0x000010ED, 0x00001CAD}, {0x000010EE, 0x00001CAE}, +{0x000010EF, 0x00001CAF}, {0x000010F0, 0x00001CB0}, {0x000010F1, 0x00001CB1}, {0x000010F2, 0x00001CB2}, +{0x000010F3, 0x00001CB3}, {0x000010F4, 0x00001CB4}, {0x000010F5, 0x00001CB5}, {0x000010F6, 0x00001CB6}, +{0x000010F7, 0x00001CB7}, {0x000010F8, 0x00001CB8}, {0x000010F9, 0x00001CB9}, {0x000010FA, 0x00001CBA}, +{0x000010FD, 0x00001CBD}, {0x000010FE, 0x00001CBE}, {0x000010FF, 0x00001CBF}, {0x000013F8, 0x000013F0}, +{0x000013F9, 0x000013F1}, {0x000013FA, 0x000013F2}, {0x000013FB, 0x000013F3}, {0x000013FC, 0x000013F4}, +{0x000013FD, 0x000013F5}, {0x00001C80, 0x00000412}, {0x00001C81, 0x00000414}, {0x00001C82, 0x0000041E}, +{0x00001C83, 0x00000421}, {0x00001C84, 0x00000422}, {0x00001C85, 0x00000422}, {0x00001C86, 0x0000042A}, +{0x00001C87, 0x00000462}, {0x00001C88, 0x0000A64A}, {0x00001D79, 0x0000A77D}, {0x00001D7D, 0x00002C63}, +{0x00001D8E, 0x0000A7C6}, {0x00001E01, 0x00001E00}, {0x00001E03, 0x00001E02}, {0x00001E05, 0x00001E04}, +{0x00001E07, 0x00001E06}, {0x00001E09, 0x00001E08}, {0x00001E0B, 0x00001E0A}, {0x00001E0D, 0x00001E0C}, +{0x00001E0F, 0x00001E0E}, {0x00001E11, 0x00001E10}, {0x00001E13, 0x00001E12}, {0x00001E15, 0x00001E14}, +{0x00001E17, 0x00001E16}, {0x00001E19, 0x00001E18}, {0x00001E1B, 0x00001E1A}, {0x00001E1D, 0x00001E1C}, +{0x00001E1F, 0x00001E1E}, {0x00001E21, 0x00001E20}, {0x00001E23, 0x00001E22}, {0x00001E25, 0x00001E24}, +{0x00001E27, 0x00001E26}, {0x00001E29, 0x00001E28}, {0x00001E2B, 0x00001E2A}, {0x00001E2D, 0x00001E2C}, +{0x00001E2F, 0x00001E2E}, {0x00001E31, 0x00001E30}, {0x00001E33, 0x00001E32}, {0x00001E35, 0x00001E34}, +{0x00001E37, 0x00001E36}, {0x00001E39, 0x00001E38}, {0x00001E3B, 0x00001E3A}, {0x00001E3D, 0x00001E3C}, +{0x00001E3F, 0x00001E3E}, {0x00001E41, 0x00001E40}, {0x00001E43, 0x00001E42}, {0x00001E45, 0x00001E44}, +{0x00001E47, 0x00001E46}, {0x00001E49, 0x00001E48}, {0x00001E4B, 0x00001E4A}, {0x00001E4D, 0x00001E4C}, +{0x00001E4F, 0x00001E4E}, {0x00001E51, 0x00001E50}, {0x00001E53, 0x00001E52}, {0x00001E55, 0x00001E54}, +{0x00001E57, 0x00001E56}, {0x00001E59, 0x00001E58}, {0x00001E5B, 0x00001E5A}, {0x00001E5D, 0x00001E5C}, +{0x00001E5F, 0x00001E5E}, {0x00001E61, 0x00001E60}, {0x00001E63, 0x00001E62}, {0x00001E65, 0x00001E64}, +{0x00001E67, 0x00001E66}, {0x00001E69, 0x00001E68}, {0x00001E6B, 0x00001E6A}, {0x00001E6D, 0x00001E6C}, +{0x00001E6F, 0x00001E6E}, {0x00001E71, 0x00001E70}, {0x00001E73, 0x00001E72}, {0x00001E75, 0x00001E74}, +{0x00001E77, 0x00001E76}, {0x00001E79, 0x00001E78}, {0x00001E7B, 0x00001E7A}, {0x00001E7D, 0x00001E7C}, +{0x00001E7F, 0x00001E7E}, {0x00001E81, 0x00001E80}, {0x00001E83, 0x00001E82}, {0x00001E85, 0x00001E84}, +{0x00001E87, 0x00001E86}, {0x00001E89, 0x00001E88}, {0x00001E8B, 0x00001E8A}, {0x00001E8D, 0x00001E8C}, +{0x00001E8F, 0x00001E8E}, {0x00001E91, 0x00001E90}, {0x00001E93, 0x00001E92}, {0x00001E95, 0x00001E94}, +{0x00001E96, 0x00000048}, {0x00001E97, 0x00000054}, {0x00001E98, 0x00000057}, {0x00001E99, 0x00000059}, +{0x00001E9A, 0x00000041}, {0x00001E9B, 0x00001E60}, {0x00001EA1, 0x00001EA0}, {0x00001EA3, 0x00001EA2}, +{0x00001EA5, 0x00001EA4}, {0x00001EA7, 0x00001EA6}, {0x00001EA9, 0x00001EA8}, {0x00001EAB, 0x00001EAA}, +{0x00001EAD, 0x00001EAC}, {0x00001EAF, 0x00001EAE}, {0x00001EB1, 0x00001EB0}, {0x00001EB3, 0x00001EB2}, +{0x00001EB5, 0x00001EB4}, {0x00001EB7, 0x00001EB6}, {0x00001EB9, 0x00001EB8}, {0x00001EBB, 0x00001EBA}, +{0x00001EBD, 0x00001EBC}, {0x00001EBF, 0x00001EBE}, {0x00001EC1, 0x00001EC0}, {0x00001EC3, 0x00001EC2}, +{0x00001EC5, 0x00001EC4}, {0x00001EC7, 0x00001EC6}, {0x00001EC9, 0x00001EC8}, {0x00001ECB, 0x00001ECA}, +{0x00001ECD, 0x00001ECC}, {0x00001ECF, 0x00001ECE}, {0x00001ED1, 0x00001ED0}, {0x00001ED3, 0x00001ED2}, +{0x00001ED5, 0x00001ED4}, {0x00001ED7, 0x00001ED6}, {0x00001ED9, 0x00001ED8}, {0x00001EDB, 0x00001EDA}, +{0x00001EDD, 0x00001EDC}, {0x00001EDF, 0x00001EDE}, {0x00001EE1, 0x00001EE0}, {0x00001EE3, 0x00001EE2}, +{0x00001EE5, 0x00001EE4}, {0x00001EE7, 0x00001EE6}, {0x00001EE9, 0x00001EE8}, {0x00001EEB, 0x00001EEA}, +{0x00001EED, 0x00001EEC}, {0x00001EEF, 0x00001EEE}, {0x00001EF1, 0x00001EF0}, {0x00001EF3, 0x00001EF2}, +{0x00001EF5, 0x00001EF4}, {0x00001EF7, 0x00001EF6}, {0x00001EF9, 0x00001EF8}, {0x00001EFB, 0x00001EFA}, +{0x00001EFD, 0x00001EFC}, {0x00001EFF, 0x00001EFE}, {0x00001F00, 0x00001F08}, {0x00001F01, 0x00001F09}, +{0x00001F02, 0x00001F0A}, {0x00001F03, 0x00001F0B}, {0x00001F04, 0x00001F0C}, {0x00001F05, 0x00001F0D}, +{0x00001F06, 0x00001F0E}, {0x00001F07, 0x00001F0F}, {0x00001F10, 0x00001F18}, {0x00001F11, 0x00001F19}, +{0x00001F12, 0x00001F1A}, {0x00001F13, 0x00001F1B}, {0x00001F14, 0x00001F1C}, {0x00001F15, 0x00001F1D}, +{0x00001F20, 0x00001F28}, {0x00001F21, 0x00001F29}, {0x00001F22, 0x00001F2A}, {0x00001F23, 0x00001F2B}, +{0x00001F24, 0x00001F2C}, {0x00001F25, 0x00001F2D}, {0x00001F26, 0x00001F2E}, {0x00001F27, 0x00001F2F}, +{0x00001F30, 0x00001F38}, {0x00001F31, 0x00001F39}, {0x00001F32, 0x00001F3A}, {0x00001F33, 0x00001F3B}, +{0x00001F34, 0x00001F3C}, {0x00001F35, 0x00001F3D}, {0x00001F36, 0x00001F3E}, {0x00001F37, 0x00001F3F}, +{0x00001F40, 0x00001F48}, {0x00001F41, 0x00001F49}, {0x00001F42, 0x00001F4A}, {0x00001F43, 0x00001F4B}, +{0x00001F44, 0x00001F4C}, {0x00001F45, 0x00001F4D}, {0x00001F50, 0x000003A5}, {0x00001F51, 0x00001F59}, +{0x00001F52, 0x000003A5}, {0x00001F53, 0x00001F5B}, {0x00001F54, 0x000003A5}, {0x00001F55, 0x00001F5D}, +{0x00001F56, 0x000003A5}, {0x00001F57, 0x00001F5F}, {0x00001F60, 0x00001F68}, {0x00001F61, 0x00001F69}, +{0x00001F62, 0x00001F6A}, {0x00001F63, 0x00001F6B}, {0x00001F64, 0x00001F6C}, {0x00001F65, 0x00001F6D}, +{0x00001F66, 0x00001F6E}, {0x00001F67, 0x00001F6F}, {0x00001F70, 0x00001FBA}, {0x00001F71, 0x00001FBB}, +{0x00001F72, 0x00001FC8}, {0x00001F73, 0x00001FC9}, {0x00001F74, 0x00001FCA}, {0x00001F75, 0x00001FCB}, +{0x00001F76, 0x00001FDA}, {0x00001F77, 0x00001FDB}, {0x00001F78, 0x00001FF8}, {0x00001F79, 0x00001FF9}, +{0x00001F7A, 0x00001FEA}, {0x00001F7B, 0x00001FEB}, {0x00001F7C, 0x00001FFA}, {0x00001F7D, 0x00001FFB}, +{0x00001F80, 0x00001F08}, {0x00001F81, 0x00001F09}, {0x00001F82, 0x00001F0A}, {0x00001F83, 0x00001F0B}, +{0x00001F84, 0x00001F0C}, {0x00001F85, 0x00001F0D}, {0x00001F86, 0x00001F0E}, {0x00001F87, 0x00001F0F}, +{0x00001F88, 0x00001F08}, {0x00001F89, 0x00001F09}, {0x00001F8A, 0x00001F0A}, {0x00001F8B, 0x00001F0B}, +{0x00001F8C, 0x00001F0C}, {0x00001F8D, 0x00001F0D}, {0x00001F8E, 0x00001F0E}, {0x00001F8F, 0x00001F0F}, +{0x00001F90, 0x00001F28}, {0x00001F91, 0x00001F29}, {0x00001F92, 0x00001F2A}, {0x00001F93, 0x00001F2B}, +{0x00001F94, 0x00001F2C}, {0x00001F95, 0x00001F2D}, {0x00001F96, 0x00001F2E}, {0x00001F97, 0x00001F2F}, +{0x00001F98, 0x00001F28}, {0x00001F99, 0x00001F29}, {0x00001F9A, 0x00001F2A}, {0x00001F9B, 0x00001F2B}, +{0x00001F9C, 0x00001F2C}, {0x00001F9D, 0x00001F2D}, {0x00001F9E, 0x00001F2E}, {0x00001F9F, 0x00001F2F}, +{0x00001FA0, 0x00001F68}, {0x00001FA1, 0x00001F69}, {0x00001FA2, 0x00001F6A}, {0x00001FA3, 0x00001F6B}, +{0x00001FA4, 0x00001F6C}, {0x00001FA5, 0x00001F6D}, {0x00001FA6, 0x00001F6E}, {0x00001FA7, 0x00001F6F}, +{0x00001FA8, 0x00001F68}, {0x00001FA9, 0x00001F69}, {0x00001FAA, 0x00001F6A}, {0x00001FAB, 0x00001F6B}, +{0x00001FAC, 0x00001F6C}, {0x00001FAD, 0x00001F6D}, {0x00001FAE, 0x00001F6E}, {0x00001FAF, 0x00001F6F}, +{0x00001FB0, 0x00001FB8}, {0x00001FB1, 0x00001FB9}, {0x00001FB2, 0x00001FBA}, {0x00001FB3, 0x00000391}, +{0x00001FB4, 0x00000386}, {0x00001FB6, 0x00000391}, {0x00001FB7, 0x00000391}, {0x00001FBC, 0x00000391}, +{0x00001FBE, 0x00000399}, {0x00001FC2, 0x00001FCA}, {0x00001FC3, 0x00000397}, {0x00001FC4, 0x00000389}, +{0x00001FC6, 0x00000397}, {0x00001FC7, 0x00000397}, {0x00001FCC, 0x00000397}, {0x00001FD0, 0x00001FD8}, +{0x00001FD1, 0x00001FD9}, {0x00001FD2, 0x00000399}, {0x00001FD3, 0x00000399}, {0x00001FD6, 0x00000399}, +{0x00001FD7, 0x00000399}, {0x00001FE0, 0x00001FE8}, {0x00001FE1, 0x00001FE9}, {0x00001FE2, 0x000003A5}, +{0x00001FE3, 0x000003A5}, {0x00001FE4, 0x000003A1}, {0x00001FE5, 0x00001FEC}, {0x00001FE6, 0x000003A5}, +{0x00001FE7, 0x000003A5}, {0x00001FF2, 0x00001FFA}, {0x00001FF3, 0x000003A9}, {0x00001FF4, 0x0000038F}, +{0x00001FF6, 0x000003A9}, {0x00001FF7, 0x000003A9}, {0x00001FFC, 0x000003A9}, {0x0000214E, 0x00002132}, +{0x00002170, 0x00002160}, {0x00002171, 0x00002161}, {0x00002172, 0x00002162}, {0x00002173, 0x00002163}, +{0x00002174, 0x00002164}, {0x00002175, 0x00002165}, {0x00002176, 0x00002166}, {0x00002177, 0x00002167}, +{0x00002178, 0x00002168}, {0x00002179, 0x00002169}, {0x0000217A, 0x0000216A}, {0x0000217B, 0x0000216B}, +{0x0000217C, 0x0000216C}, {0x0000217D, 0x0000216D}, {0x0000217E, 0x0000216E}, {0x0000217F, 0x0000216F}, +{0x00002184, 0x00002183}, {0x000024D0, 0x000024B6}, {0x000024D1, 0x000024B7}, {0x000024D2, 0x000024B8}, +{0x000024D3, 0x000024B9}, {0x000024D4, 0x000024BA}, {0x000024D5, 0x000024BB}, {0x000024D6, 0x000024BC}, +{0x000024D7, 0x000024BD}, {0x000024D8, 0x000024BE}, {0x000024D9, 0x000024BF}, {0x000024DA, 0x000024C0}, +{0x000024DB, 0x000024C1}, {0x000024DC, 0x000024C2}, {0x000024DD, 0x000024C3}, {0x000024DE, 0x000024C4}, +{0x000024DF, 0x000024C5}, {0x000024E0, 0x000024C6}, {0x000024E1, 0x000024C7}, {0x000024E2, 0x000024C8}, +{0x000024E3, 0x000024C9}, {0x000024E4, 0x000024CA}, {0x000024E5, 0x000024CB}, {0x000024E6, 0x000024CC}, +{0x000024E7, 0x000024CD}, {0x000024E8, 0x000024CE}, {0x000024E9, 0x000024CF}, {0x00002C30, 0x00002C00}, +{0x00002C31, 0x00002C01}, {0x00002C32, 0x00002C02}, {0x00002C33, 0x00002C03}, {0x00002C34, 0x00002C04}, +{0x00002C35, 0x00002C05}, {0x00002C36, 0x00002C06}, {0x00002C37, 0x00002C07}, {0x00002C38, 0x00002C08}, +{0x00002C39, 0x00002C09}, {0x00002C3A, 0x00002C0A}, {0x00002C3B, 0x00002C0B}, {0x00002C3C, 0x00002C0C}, +{0x00002C3D, 0x00002C0D}, {0x00002C3E, 0x00002C0E}, {0x00002C3F, 0x00002C0F}, {0x00002C40, 0x00002C10}, +{0x00002C41, 0x00002C11}, {0x00002C42, 0x00002C12}, {0x00002C43, 0x00002C13}, {0x00002C44, 0x00002C14}, +{0x00002C45, 0x00002C15}, {0x00002C46, 0x00002C16}, {0x00002C47, 0x00002C17}, {0x00002C48, 0x00002C18}, +{0x00002C49, 0x00002C19}, {0x00002C4A, 0x00002C1A}, {0x00002C4B, 0x00002C1B}, {0x00002C4C, 0x00002C1C}, +{0x00002C4D, 0x00002C1D}, {0x00002C4E, 0x00002C1E}, {0x00002C4F, 0x00002C1F}, {0x00002C50, 0x00002C20}, +{0x00002C51, 0x00002C21}, {0x00002C52, 0x00002C22}, {0x00002C53, 0x00002C23}, {0x00002C54, 0x00002C24}, +{0x00002C55, 0x00002C25}, {0x00002C56, 0x00002C26}, {0x00002C57, 0x00002C27}, {0x00002C58, 0x00002C28}, +{0x00002C59, 0x00002C29}, {0x00002C5A, 0x00002C2A}, {0x00002C5B, 0x00002C2B}, {0x00002C5C, 0x00002C2C}, +{0x00002C5D, 0x00002C2D}, {0x00002C5E, 0x00002C2E}, {0x00002C61, 0x00002C60}, {0x00002C65, 0x0000023A}, +{0x00002C66, 0x0000023E}, {0x00002C68, 0x00002C67}, {0x00002C6A, 0x00002C69}, {0x00002C6C, 0x00002C6B}, +{0x00002C73, 0x00002C72}, {0x00002C76, 0x00002C75}, {0x00002C81, 0x00002C80}, {0x00002C83, 0x00002C82}, +{0x00002C85, 0x00002C84}, {0x00002C87, 0x00002C86}, {0x00002C89, 0x00002C88}, {0x00002C8B, 0x00002C8A}, +{0x00002C8D, 0x00002C8C}, {0x00002C8F, 0x00002C8E}, {0x00002C91, 0x00002C90}, {0x00002C93, 0x00002C92}, +{0x00002C95, 0x00002C94}, {0x00002C97, 0x00002C96}, {0x00002C99, 0x00002C98}, {0x00002C9B, 0x00002C9A}, +{0x00002C9D, 0x00002C9C}, {0x00002C9F, 0x00002C9E}, {0x00002CA1, 0x00002CA0}, {0x00002CA3, 0x00002CA2}, +{0x00002CA5, 0x00002CA4}, {0x00002CA7, 0x00002CA6}, {0x00002CA9, 0x00002CA8}, {0x00002CAB, 0x00002CAA}, +{0x00002CAD, 0x00002CAC}, {0x00002CAF, 0x00002CAE}, {0x00002CB1, 0x00002CB0}, {0x00002CB3, 0x00002CB2}, +{0x00002CB5, 0x00002CB4}, {0x00002CB7, 0x00002CB6}, {0x00002CB9, 0x00002CB8}, {0x00002CBB, 0x00002CBA}, +{0x00002CBD, 0x00002CBC}, {0x00002CBF, 0x00002CBE}, {0x00002CC1, 0x00002CC0}, {0x00002CC3, 0x00002CC2}, +{0x00002CC5, 0x00002CC4}, {0x00002CC7, 0x00002CC6}, {0x00002CC9, 0x00002CC8}, {0x00002CCB, 0x00002CCA}, +{0x00002CCD, 0x00002CCC}, {0x00002CCF, 0x00002CCE}, {0x00002CD1, 0x00002CD0}, {0x00002CD3, 0x00002CD2}, +{0x00002CD5, 0x00002CD4}, {0x00002CD7, 0x00002CD6}, {0x00002CD9, 0x00002CD8}, {0x00002CDB, 0x00002CDA}, +{0x00002CDD, 0x00002CDC}, {0x00002CDF, 0x00002CDE}, {0x00002CE1, 0x00002CE0}, {0x00002CE3, 0x00002CE2}, +{0x00002CEC, 0x00002CEB}, {0x00002CEE, 0x00002CED}, {0x00002CF3, 0x00002CF2}, {0x00002D00, 0x000010A0}, +{0x00002D01, 0x000010A1}, {0x00002D02, 0x000010A2}, {0x00002D03, 0x000010A3}, {0x00002D04, 0x000010A4}, +{0x00002D05, 0x000010A5}, {0x00002D06, 0x000010A6}, {0x00002D07, 0x000010A7}, {0x00002D08, 0x000010A8}, +{0x00002D09, 0x000010A9}, {0x00002D0A, 0x000010AA}, {0x00002D0B, 0x000010AB}, {0x00002D0C, 0x000010AC}, +{0x00002D0D, 0x000010AD}, {0x00002D0E, 0x000010AE}, {0x00002D0F, 0x000010AF}, {0x00002D10, 0x000010B0}, +{0x00002D11, 0x000010B1}, {0x00002D12, 0x000010B2}, {0x00002D13, 0x000010B3}, {0x00002D14, 0x000010B4}, +{0x00002D15, 0x000010B5}, {0x00002D16, 0x000010B6}, {0x00002D17, 0x000010B7}, {0x00002D18, 0x000010B8}, +{0x00002D19, 0x000010B9}, {0x00002D1A, 0x000010BA}, {0x00002D1B, 0x000010BB}, {0x00002D1C, 0x000010BC}, +{0x00002D1D, 0x000010BD}, {0x00002D1E, 0x000010BE}, {0x00002D1F, 0x000010BF}, {0x00002D20, 0x000010C0}, +{0x00002D21, 0x000010C1}, {0x00002D22, 0x000010C2}, {0x00002D23, 0x000010C3}, {0x00002D24, 0x000010C4}, +{0x00002D25, 0x000010C5}, {0x00002D27, 0x000010C7}, {0x00002D2D, 0x000010CD}, {0x0000A641, 0x0000A640}, +{0x0000A643, 0x0000A642}, {0x0000A645, 0x0000A644}, {0x0000A647, 0x0000A646}, {0x0000A649, 0x0000A648}, +{0x0000A64B, 0x0000A64A}, {0x0000A64D, 0x0000A64C}, {0x0000A64F, 0x0000A64E}, {0x0000A651, 0x0000A650}, +{0x0000A653, 0x0000A652}, {0x0000A655, 0x0000A654}, {0x0000A657, 0x0000A656}, {0x0000A659, 0x0000A658}, +{0x0000A65B, 0x0000A65A}, {0x0000A65D, 0x0000A65C}, {0x0000A65F, 0x0000A65E}, {0x0000A661, 0x0000A660}, +{0x0000A663, 0x0000A662}, {0x0000A665, 0x0000A664}, {0x0000A667, 0x0000A666}, {0x0000A669, 0x0000A668}, +{0x0000A66B, 0x0000A66A}, {0x0000A66D, 0x0000A66C}, {0x0000A681, 0x0000A680}, {0x0000A683, 0x0000A682}, +{0x0000A685, 0x0000A684}, {0x0000A687, 0x0000A686}, {0x0000A689, 0x0000A688}, {0x0000A68B, 0x0000A68A}, +{0x0000A68D, 0x0000A68C}, {0x0000A68F, 0x0000A68E}, {0x0000A691, 0x0000A690}, {0x0000A693, 0x0000A692}, +{0x0000A695, 0x0000A694}, {0x0000A697, 0x0000A696}, {0x0000A699, 0x0000A698}, {0x0000A69B, 0x0000A69A}, +{0x0000A723, 0x0000A722}, {0x0000A725, 0x0000A724}, {0x0000A727, 0x0000A726}, {0x0000A729, 0x0000A728}, +{0x0000A72B, 0x0000A72A}, {0x0000A72D, 0x0000A72C}, {0x0000A72F, 0x0000A72E}, {0x0000A733, 0x0000A732}, +{0x0000A735, 0x0000A734}, {0x0000A737, 0x0000A736}, {0x0000A739, 0x0000A738}, {0x0000A73B, 0x0000A73A}, +{0x0000A73D, 0x0000A73C}, {0x0000A73F, 0x0000A73E}, {0x0000A741, 0x0000A740}, {0x0000A743, 0x0000A742}, +{0x0000A745, 0x0000A744}, {0x0000A747, 0x0000A746}, {0x0000A749, 0x0000A748}, {0x0000A74B, 0x0000A74A}, +{0x0000A74D, 0x0000A74C}, {0x0000A74F, 0x0000A74E}, {0x0000A751, 0x0000A750}, {0x0000A753, 0x0000A752}, +{0x0000A755, 0x0000A754}, {0x0000A757, 0x0000A756}, {0x0000A759, 0x0000A758}, {0x0000A75B, 0x0000A75A}, +{0x0000A75D, 0x0000A75C}, {0x0000A75F, 0x0000A75E}, {0x0000A761, 0x0000A760}, {0x0000A763, 0x0000A762}, +{0x0000A765, 0x0000A764}, {0x0000A767, 0x0000A766}, {0x0000A769, 0x0000A768}, {0x0000A76B, 0x0000A76A}, +{0x0000A76D, 0x0000A76C}, {0x0000A76F, 0x0000A76E}, {0x0000A77A, 0x0000A779}, {0x0000A77C, 0x0000A77B}, +{0x0000A77F, 0x0000A77E}, {0x0000A781, 0x0000A780}, {0x0000A783, 0x0000A782}, {0x0000A785, 0x0000A784}, +{0x0000A787, 0x0000A786}, {0x0000A78C, 0x0000A78B}, {0x0000A791, 0x0000A790}, {0x0000A793, 0x0000A792}, +{0x0000A794, 0x0000A7C4}, {0x0000A797, 0x0000A796}, {0x0000A799, 0x0000A798}, {0x0000A79B, 0x0000A79A}, +{0x0000A79D, 0x0000A79C}, {0x0000A79F, 0x0000A79E}, {0x0000A7A1, 0x0000A7A0}, {0x0000A7A3, 0x0000A7A2}, +{0x0000A7A5, 0x0000A7A4}, {0x0000A7A7, 0x0000A7A6}, {0x0000A7A9, 0x0000A7A8}, {0x0000A7B5, 0x0000A7B4}, +{0x0000A7B7, 0x0000A7B6}, {0x0000A7B9, 0x0000A7B8}, {0x0000A7BB, 0x0000A7BA}, {0x0000A7BD, 0x0000A7BC}, +{0x0000A7BF, 0x0000A7BE}, {0x0000A7C3, 0x0000A7C2}, {0x0000A7C8, 0x0000A7C7}, {0x0000A7CA, 0x0000A7C9}, +{0x0000A7F6, 0x0000A7F5}, {0x0000AB53, 0x0000A7B3}, {0x0000AB70, 0x000013A0}, {0x0000AB71, 0x000013A1}, +{0x0000AB72, 0x000013A2}, {0x0000AB73, 0x000013A3}, {0x0000AB74, 0x000013A4}, {0x0000AB75, 0x000013A5}, +{0x0000AB76, 0x000013A6}, {0x0000AB77, 0x000013A7}, {0x0000AB78, 0x000013A8}, {0x0000AB79, 0x000013A9}, +{0x0000AB7A, 0x000013AA}, {0x0000AB7B, 0x000013AB}, {0x0000AB7C, 0x000013AC}, {0x0000AB7D, 0x000013AD}, +{0x0000AB7E, 0x000013AE}, {0x0000AB7F, 0x000013AF}, {0x0000AB80, 0x000013B0}, {0x0000AB81, 0x000013B1}, +{0x0000AB82, 0x000013B2}, {0x0000AB83, 0x000013B3}, {0x0000AB84, 0x000013B4}, {0x0000AB85, 0x000013B5}, +{0x0000AB86, 0x000013B6}, {0x0000AB87, 0x000013B7}, {0x0000AB88, 0x000013B8}, {0x0000AB89, 0x000013B9}, +{0x0000AB8A, 0x000013BA}, {0x0000AB8B, 0x000013BB}, {0x0000AB8C, 0x000013BC}, {0x0000AB8D, 0x000013BD}, +{0x0000AB8E, 0x000013BE}, {0x0000AB8F, 0x000013BF}, {0x0000AB90, 0x000013C0}, {0x0000AB91, 0x000013C1}, +{0x0000AB92, 0x000013C2}, {0x0000AB93, 0x000013C3}, {0x0000AB94, 0x000013C4}, {0x0000AB95, 0x000013C5}, +{0x0000AB96, 0x000013C6}, {0x0000AB97, 0x000013C7}, {0x0000AB98, 0x000013C8}, {0x0000AB99, 0x000013C9}, +{0x0000AB9A, 0x000013CA}, {0x0000AB9B, 0x000013CB}, {0x0000AB9C, 0x000013CC}, {0x0000AB9D, 0x000013CD}, +{0x0000AB9E, 0x000013CE}, {0x0000AB9F, 0x000013CF}, {0x0000ABA0, 0x000013D0}, {0x0000ABA1, 0x000013D1}, +{0x0000ABA2, 0x000013D2}, {0x0000ABA3, 0x000013D3}, {0x0000ABA4, 0x000013D4}, {0x0000ABA5, 0x000013D5}, +{0x0000ABA6, 0x000013D6}, {0x0000ABA7, 0x000013D7}, {0x0000ABA8, 0x000013D8}, {0x0000ABA9, 0x000013D9}, +{0x0000ABAA, 0x000013DA}, {0x0000ABAB, 0x000013DB}, {0x0000ABAC, 0x000013DC}, {0x0000ABAD, 0x000013DD}, +{0x0000ABAE, 0x000013DE}, {0x0000ABAF, 0x000013DF}, {0x0000ABB0, 0x000013E0}, {0x0000ABB1, 0x000013E1}, +{0x0000ABB2, 0x000013E2}, {0x0000ABB3, 0x000013E3}, {0x0000ABB4, 0x000013E4}, {0x0000ABB5, 0x000013E5}, +{0x0000ABB6, 0x000013E6}, {0x0000ABB7, 0x000013E7}, {0x0000ABB8, 0x000013E8}, {0x0000ABB9, 0x000013E9}, +{0x0000ABBA, 0x000013EA}, {0x0000ABBB, 0x000013EB}, {0x0000ABBC, 0x000013EC}, {0x0000ABBD, 0x000013ED}, +{0x0000ABBE, 0x000013EE}, {0x0000ABBF, 0x000013EF}, {0x0000FB00, 0x00000046}, {0x0000FB01, 0x00000046}, +{0x0000FB02, 0x00000046}, {0x0000FB03, 0x00000046}, {0x0000FB04, 0x00000046}, {0x0000FB05, 0x00000053}, +{0x0000FB06, 0x00000053}, {0x0000FB13, 0x00000544}, {0x0000FB14, 0x00000544}, {0x0000FB15, 0x00000544}, +{0x0000FB16, 0x0000054E}, {0x0000FB17, 0x00000544}, {0x0000FF41, 0x0000FF21}, {0x0000FF42, 0x0000FF22}, +{0x0000FF43, 0x0000FF23}, {0x0000FF44, 0x0000FF24}, {0x0000FF45, 0x0000FF25}, {0x0000FF46, 0x0000FF26}, +{0x0000FF47, 0x0000FF27}, {0x0000FF48, 0x0000FF28}, {0x0000FF49, 0x0000FF29}, {0x0000FF4A, 0x0000FF2A}, +{0x0000FF4B, 0x0000FF2B}, {0x0000FF4C, 0x0000FF2C}, {0x0000FF4D, 0x0000FF2D}, {0x0000FF4E, 0x0000FF2E}, +{0x0000FF4F, 0x0000FF2F}, {0x0000FF50, 0x0000FF30}, {0x0000FF51, 0x0000FF31}, {0x0000FF52, 0x0000FF32}, +{0x0000FF53, 0x0000FF33}, {0x0000FF54, 0x0000FF34}, {0x0000FF55, 0x0000FF35}, {0x0000FF56, 0x0000FF36}, +{0x0000FF57, 0x0000FF37}, {0x0000FF58, 0x0000FF38}, {0x0000FF59, 0x0000FF39}, {0x0000FF5A, 0x0000FF3A}, +{0x00010428, 0x00010400}, {0x00010429, 0x00010401}, {0x0001042A, 0x00010402}, {0x0001042B, 0x00010403}, +{0x0001042C, 0x00010404}, {0x0001042D, 0x00010405}, {0x0001042E, 0x00010406}, {0x0001042F, 0x00010407}, +{0x00010430, 0x00010408}, {0x00010431, 0x00010409}, {0x00010432, 0x0001040A}, {0x00010433, 0x0001040B}, +{0x00010434, 0x0001040C}, {0x00010435, 0x0001040D}, {0x00010436, 0x0001040E}, {0x00010437, 0x0001040F}, +{0x00010438, 0x00010410}, {0x00010439, 0x00010411}, {0x0001043A, 0x00010412}, {0x0001043B, 0x00010413}, +{0x0001043C, 0x00010414}, {0x0001043D, 0x00010415}, {0x0001043E, 0x00010416}, {0x0001043F, 0x00010417}, +{0x00010440, 0x00010418}, {0x00010441, 0x00010419}, {0x00010442, 0x0001041A}, {0x00010443, 0x0001041B}, +{0x00010444, 0x0001041C}, {0x00010445, 0x0001041D}, {0x00010446, 0x0001041E}, {0x00010447, 0x0001041F}, +{0x00010448, 0x00010420}, {0x00010449, 0x00010421}, {0x0001044A, 0x00010422}, {0x0001044B, 0x00010423}, +{0x0001044C, 0x00010424}, {0x0001044D, 0x00010425}, {0x0001044E, 0x00010426}, {0x0001044F, 0x00010427}, +{0x000104D8, 0x000104B0}, {0x000104D9, 0x000104B1}, {0x000104DA, 0x000104B2}, {0x000104DB, 0x000104B3}, +{0x000104DC, 0x000104B4}, {0x000104DD, 0x000104B5}, {0x000104DE, 0x000104B6}, {0x000104DF, 0x000104B7}, +{0x000104E0, 0x000104B8}, {0x000104E1, 0x000104B9}, {0x000104E2, 0x000104BA}, {0x000104E3, 0x000104BB}, +{0x000104E4, 0x000104BC}, {0x000104E5, 0x000104BD}, {0x000104E6, 0x000104BE}, {0x000104E7, 0x000104BF}, +{0x000104E8, 0x000104C0}, {0x000104E9, 0x000104C1}, {0x000104EA, 0x000104C2}, {0x000104EB, 0x000104C3}, +{0x000104EC, 0x000104C4}, {0x000104ED, 0x000104C5}, {0x000104EE, 0x000104C6}, {0x000104EF, 0x000104C7}, +{0x000104F0, 0x000104C8}, {0x000104F1, 0x000104C9}, {0x000104F2, 0x000104CA}, {0x000104F3, 0x000104CB}, +{0x000104F4, 0x000104CC}, {0x000104F5, 0x000104CD}, {0x000104F6, 0x000104CE}, {0x000104F7, 0x000104CF}, +{0x000104F8, 0x000104D0}, {0x000104F9, 0x000104D1}, {0x000104FA, 0x000104D2}, {0x000104FB, 0x000104D3}, +{0x00010CC0, 0x00010C80}, {0x00010CC1, 0x00010C81}, {0x00010CC2, 0x00010C82}, {0x00010CC3, 0x00010C83}, +{0x00010CC4, 0x00010C84}, {0x00010CC5, 0x00010C85}, {0x00010CC6, 0x00010C86}, {0x00010CC7, 0x00010C87}, +{0x00010CC8, 0x00010C88}, {0x00010CC9, 0x00010C89}, {0x00010CCA, 0x00010C8A}, {0x00010CCB, 0x00010C8B}, +{0x00010CCC, 0x00010C8C}, {0x00010CCD, 0x00010C8D}, {0x00010CCE, 0x00010C8E}, {0x00010CCF, 0x00010C8F}, +{0x00010CD0, 0x00010C90}, {0x00010CD1, 0x00010C91}, {0x00010CD2, 0x00010C92}, {0x00010CD3, 0x00010C93}, +{0x00010CD4, 0x00010C94}, {0x00010CD5, 0x00010C95}, {0x00010CD6, 0x00010C96}, {0x00010CD7, 0x00010C97}, +{0x00010CD8, 0x00010C98}, {0x00010CD9, 0x00010C99}, {0x00010CDA, 0x00010C9A}, {0x00010CDB, 0x00010C9B}, +{0x00010CDC, 0x00010C9C}, {0x00010CDD, 0x00010C9D}, {0x00010CDE, 0x00010C9E}, {0x00010CDF, 0x00010C9F}, +{0x00010CE0, 0x00010CA0}, {0x00010CE1, 0x00010CA1}, {0x00010CE2, 0x00010CA2}, {0x00010CE3, 0x00010CA3}, +{0x00010CE4, 0x00010CA4}, {0x00010CE5, 0x00010CA5}, {0x00010CE6, 0x00010CA6}, {0x00010CE7, 0x00010CA7}, +{0x00010CE8, 0x00010CA8}, {0x00010CE9, 0x00010CA9}, {0x00010CEA, 0x00010CAA}, {0x00010CEB, 0x00010CAB}, +{0x00010CEC, 0x00010CAC}, {0x00010CED, 0x00010CAD}, {0x00010CEE, 0x00010CAE}, {0x00010CEF, 0x00010CAF}, +{0x00010CF0, 0x00010CB0}, {0x00010CF1, 0x00010CB1}, {0x00010CF2, 0x00010CB2}, {0x000118C0, 0x000118A0}, +{0x000118C1, 0x000118A1}, {0x000118C2, 0x000118A2}, {0x000118C3, 0x000118A3}, {0x000118C4, 0x000118A4}, +{0x000118C5, 0x000118A5}, {0x000118C6, 0x000118A6}, {0x000118C7, 0x000118A7}, {0x000118C8, 0x000118A8}, +{0x000118C9, 0x000118A9}, {0x000118CA, 0x000118AA}, {0x000118CB, 0x000118AB}, {0x000118CC, 0x000118AC}, +{0x000118CD, 0x000118AD}, {0x000118CE, 0x000118AE}, {0x000118CF, 0x000118AF}, {0x000118D0, 0x000118B0}, +{0x000118D1, 0x000118B1}, {0x000118D2, 0x000118B2}, {0x000118D3, 0x000118B3}, {0x000118D4, 0x000118B4}, +{0x000118D5, 0x000118B5}, {0x000118D6, 0x000118B6}, {0x000118D7, 0x000118B7}, {0x000118D8, 0x000118B8}, +{0x000118D9, 0x000118B9}, {0x000118DA, 0x000118BA}, {0x000118DB, 0x000118BB}, {0x000118DC, 0x000118BC}, +{0x000118DD, 0x000118BD}, {0x000118DE, 0x000118BE}, {0x000118DF, 0x000118BF}, {0x00016E60, 0x00016E40}, +{0x00016E61, 0x00016E41}, {0x00016E62, 0x00016E42}, {0x00016E63, 0x00016E43}, {0x00016E64, 0x00016E44}, +{0x00016E65, 0x00016E45}, {0x00016E66, 0x00016E46}, {0x00016E67, 0x00016E47}, {0x00016E68, 0x00016E48}, +{0x00016E69, 0x00016E49}, {0x00016E6A, 0x00016E4A}, {0x00016E6B, 0x00016E4B}, {0x00016E6C, 0x00016E4C}, +{0x00016E6D, 0x00016E4D}, {0x00016E6E, 0x00016E4E}, {0x00016E6F, 0x00016E4F}, {0x00016E70, 0x00016E50}, +{0x00016E71, 0x00016E51}, {0x00016E72, 0x00016E52}, {0x00016E73, 0x00016E53}, {0x00016E74, 0x00016E54}, +{0x00016E75, 0x00016E55}, {0x00016E76, 0x00016E56}, {0x00016E77, 0x00016E57}, {0x00016E78, 0x00016E58}, +{0x00016E79, 0x00016E59}, {0x00016E7A, 0x00016E5A}, {0x00016E7B, 0x00016E5B}, {0x00016E7C, 0x00016E5C}, +{0x00016E7D, 0x00016E5D}, {0x00016E7E, 0x00016E5E}, {0x00016E7F, 0x00016E5F}, {0x0001E922, 0x0001E900}, +{0x0001E923, 0x0001E901}, {0x0001E924, 0x0001E902}, {0x0001E925, 0x0001E903}, {0x0001E926, 0x0001E904}, +{0x0001E927, 0x0001E905}, {0x0001E928, 0x0001E906}, {0x0001E929, 0x0001E907}, {0x0001E92A, 0x0001E908}, +{0x0001E92B, 0x0001E909}, {0x0001E92C, 0x0001E90A}, {0x0001E92D, 0x0001E90B}, {0x0001E92E, 0x0001E90C}, +{0x0001E92F, 0x0001E90D}, {0x0001E930, 0x0001E90E}, {0x0001E931, 0x0001E90F}, {0x0001E932, 0x0001E910}, +{0x0001E933, 0x0001E911}, {0x0001E934, 0x0001E912}, {0x0001E935, 0x0001E913}, {0x0001E936, 0x0001E914}, +{0x0001E937, 0x0001E915}, {0x0001E938, 0x0001E916}, {0x0001E939, 0x0001E917}, {0x0001E93A, 0x0001E918}, +{0x0001E93B, 0x0001E919}, {0x0001E93C, 0x0001E91A}, {0x0001E93D, 0x0001E91B}, {0x0001E93E, 0x0001E91C}, +{0x0001E93F, 0x0001E91D}, {0x0001E940, 0x0001E91E}, {0x0001E941, 0x0001E91F}, {0x0001E942, 0x0001E920}, +{0x0001E943, 0x0001E921}, +}; + +const std::multimap unicode_map_nfd = { +{0x000000C0, 0x00000041}, {0x000000C0, 0x00000300}, {0x000000C1, 0x00000041}, {0x000000C1, 0x00000301}, +{0x000000C2, 0x00000041}, {0x000000C2, 0x00000302}, {0x000000C3, 0x00000041}, {0x000000C3, 0x00000303}, +{0x000000C4, 0x00000041}, {0x000000C4, 0x00000308}, {0x000000C5, 0x00000041}, {0x000000C5, 0x0000030A}, +{0x000000C7, 0x00000043}, {0x000000C7, 0x00000327}, {0x000000C8, 0x00000045}, {0x000000C8, 0x00000300}, +{0x000000C9, 0x00000045}, {0x000000C9, 0x00000301}, {0x000000CA, 0x00000045}, {0x000000CA, 0x00000302}, +{0x000000CB, 0x00000045}, {0x000000CB, 0x00000308}, {0x000000CC, 0x00000049}, {0x000000CC, 0x00000300}, +{0x000000CD, 0x00000049}, {0x000000CD, 0x00000301}, {0x000000CE, 0x00000049}, {0x000000CE, 0x00000302}, +{0x000000CF, 0x00000049}, {0x000000CF, 0x00000308}, {0x000000D1, 0x0000004E}, {0x000000D1, 0x00000303}, +{0x000000D2, 0x0000004F}, {0x000000D2, 0x00000300}, {0x000000D3, 0x0000004F}, {0x000000D3, 0x00000301}, +{0x000000D4, 0x0000004F}, {0x000000D4, 0x00000302}, {0x000000D5, 0x0000004F}, {0x000000D5, 0x00000303}, +{0x000000D6, 0x0000004F}, {0x000000D6, 0x00000308}, {0x000000D9, 0x00000055}, {0x000000D9, 0x00000300}, +{0x000000DA, 0x00000055}, {0x000000DA, 0x00000301}, {0x000000DB, 0x00000055}, {0x000000DB, 0x00000302}, +{0x000000DC, 0x00000055}, {0x000000DC, 0x00000308}, {0x000000DD, 0x00000059}, {0x000000DD, 0x00000301}, +{0x000000E0, 0x00000061}, {0x000000E0, 0x00000300}, {0x000000E1, 0x00000061}, {0x000000E1, 0x00000301}, +{0x000000E2, 0x00000061}, {0x000000E2, 0x00000302}, {0x000000E3, 0x00000061}, {0x000000E3, 0x00000303}, +{0x000000E4, 0x00000061}, {0x000000E4, 0x00000308}, {0x000000E5, 0x00000061}, {0x000000E5, 0x0000030A}, +{0x000000E7, 0x00000063}, {0x000000E7, 0x00000327}, {0x000000E8, 0x00000065}, {0x000000E8, 0x00000300}, +{0x000000E9, 0x00000065}, {0x000000E9, 0x00000301}, {0x000000EA, 0x00000065}, {0x000000EA, 0x00000302}, +{0x000000EB, 0x00000065}, {0x000000EB, 0x00000308}, {0x000000EC, 0x00000069}, {0x000000EC, 0x00000300}, +{0x000000ED, 0x00000069}, {0x000000ED, 0x00000301}, {0x000000EE, 0x00000069}, {0x000000EE, 0x00000302}, +{0x000000EF, 0x00000069}, {0x000000EF, 0x00000308}, {0x000000F1, 0x0000006E}, {0x000000F1, 0x00000303}, +{0x000000F2, 0x0000006F}, {0x000000F2, 0x00000300}, {0x000000F3, 0x0000006F}, {0x000000F3, 0x00000301}, +{0x000000F4, 0x0000006F}, {0x000000F4, 0x00000302}, {0x000000F5, 0x0000006F}, {0x000000F5, 0x00000303}, +{0x000000F6, 0x0000006F}, {0x000000F6, 0x00000308}, {0x000000F9, 0x00000075}, {0x000000F9, 0x00000300}, +{0x000000FA, 0x00000075}, {0x000000FA, 0x00000301}, {0x000000FB, 0x00000075}, {0x000000FB, 0x00000302}, +{0x000000FC, 0x00000075}, {0x000000FC, 0x00000308}, {0x000000FD, 0x00000079}, {0x000000FD, 0x00000301}, +{0x000000FF, 0x00000079}, {0x000000FF, 0x00000308}, {0x00000100, 0x00000041}, {0x00000100, 0x00000304}, +{0x00000101, 0x00000061}, {0x00000101, 0x00000304}, {0x00000102, 0x00000041}, {0x00000102, 0x00000306}, +{0x00000103, 0x00000061}, {0x00000103, 0x00000306}, {0x00000104, 0x00000041}, {0x00000104, 0x00000328}, +{0x00000105, 0x00000061}, {0x00000105, 0x00000328}, {0x00000106, 0x00000043}, {0x00000106, 0x00000301}, +{0x00000107, 0x00000063}, {0x00000107, 0x00000301}, {0x00000108, 0x00000043}, {0x00000108, 0x00000302}, +{0x00000109, 0x00000063}, {0x00000109, 0x00000302}, {0x0000010A, 0x00000043}, {0x0000010A, 0x00000307}, +{0x0000010B, 0x00000063}, {0x0000010B, 0x00000307}, {0x0000010C, 0x00000043}, {0x0000010C, 0x0000030C}, +{0x0000010D, 0x00000063}, {0x0000010D, 0x0000030C}, {0x0000010E, 0x00000044}, {0x0000010E, 0x0000030C}, +{0x0000010F, 0x00000064}, {0x0000010F, 0x0000030C}, {0x00000112, 0x00000045}, {0x00000112, 0x00000304}, +{0x00000113, 0x00000065}, {0x00000113, 0x00000304}, {0x00000114, 0x00000045}, {0x00000114, 0x00000306}, +{0x00000115, 0x00000065}, {0x00000115, 0x00000306}, {0x00000116, 0x00000045}, {0x00000116, 0x00000307}, +{0x00000117, 0x00000065}, {0x00000117, 0x00000307}, {0x00000118, 0x00000045}, {0x00000118, 0x00000328}, +{0x00000119, 0x00000065}, {0x00000119, 0x00000328}, {0x0000011A, 0x00000045}, {0x0000011A, 0x0000030C}, +{0x0000011B, 0x00000065}, {0x0000011B, 0x0000030C}, {0x0000011C, 0x00000047}, {0x0000011C, 0x00000302}, +{0x0000011D, 0x00000067}, {0x0000011D, 0x00000302}, {0x0000011E, 0x00000047}, {0x0000011E, 0x00000306}, +{0x0000011F, 0x00000067}, {0x0000011F, 0x00000306}, {0x00000120, 0x00000047}, {0x00000120, 0x00000307}, +{0x00000121, 0x00000067}, {0x00000121, 0x00000307}, {0x00000122, 0x00000047}, {0x00000122, 0x00000327}, +{0x00000123, 0x00000067}, {0x00000123, 0x00000327}, {0x00000124, 0x00000048}, {0x00000124, 0x00000302}, +{0x00000125, 0x00000068}, {0x00000125, 0x00000302}, {0x00000128, 0x00000049}, {0x00000128, 0x00000303}, +{0x00000129, 0x00000069}, {0x00000129, 0x00000303}, {0x0000012A, 0x00000049}, {0x0000012A, 0x00000304}, +{0x0000012B, 0x00000069}, {0x0000012B, 0x00000304}, {0x0000012C, 0x00000049}, {0x0000012C, 0x00000306}, +{0x0000012D, 0x00000069}, {0x0000012D, 0x00000306}, {0x0000012E, 0x00000049}, {0x0000012E, 0x00000328}, +{0x0000012F, 0x00000069}, {0x0000012F, 0x00000328}, {0x00000130, 0x00000049}, {0x00000130, 0x00000307}, +{0x00000134, 0x0000004A}, {0x00000134, 0x00000302}, {0x00000135, 0x0000006A}, {0x00000135, 0x00000302}, +{0x00000136, 0x0000004B}, {0x00000136, 0x00000327}, {0x00000137, 0x0000006B}, {0x00000137, 0x00000327}, +{0x00000139, 0x0000004C}, {0x00000139, 0x00000301}, {0x0000013A, 0x0000006C}, {0x0000013A, 0x00000301}, +{0x0000013B, 0x0000004C}, {0x0000013B, 0x00000327}, {0x0000013C, 0x0000006C}, {0x0000013C, 0x00000327}, +{0x0000013D, 0x0000004C}, {0x0000013D, 0x0000030C}, {0x0000013E, 0x0000006C}, {0x0000013E, 0x0000030C}, +{0x00000143, 0x0000004E}, {0x00000143, 0x00000301}, {0x00000144, 0x0000006E}, {0x00000144, 0x00000301}, +{0x00000145, 0x0000004E}, {0x00000145, 0x00000327}, {0x00000146, 0x0000006E}, {0x00000146, 0x00000327}, +{0x00000147, 0x0000004E}, {0x00000147, 0x0000030C}, {0x00000148, 0x0000006E}, {0x00000148, 0x0000030C}, +{0x0000014C, 0x0000004F}, {0x0000014C, 0x00000304}, {0x0000014D, 0x0000006F}, {0x0000014D, 0x00000304}, +{0x0000014E, 0x0000004F}, {0x0000014E, 0x00000306}, {0x0000014F, 0x0000006F}, {0x0000014F, 0x00000306}, +{0x00000150, 0x0000004F}, {0x00000150, 0x0000030B}, {0x00000151, 0x0000006F}, {0x00000151, 0x0000030B}, +{0x00000154, 0x00000052}, {0x00000154, 0x00000301}, {0x00000155, 0x00000072}, {0x00000155, 0x00000301}, +{0x00000156, 0x00000052}, {0x00000156, 0x00000327}, {0x00000157, 0x00000072}, {0x00000157, 0x00000327}, +{0x00000158, 0x00000052}, {0x00000158, 0x0000030C}, {0x00000159, 0x00000072}, {0x00000159, 0x0000030C}, +{0x0000015A, 0x00000053}, {0x0000015A, 0x00000301}, {0x0000015B, 0x00000073}, {0x0000015B, 0x00000301}, +{0x0000015C, 0x00000053}, {0x0000015C, 0x00000302}, {0x0000015D, 0x00000073}, {0x0000015D, 0x00000302}, +{0x0000015E, 0x00000053}, {0x0000015E, 0x00000327}, {0x0000015F, 0x00000073}, {0x0000015F, 0x00000327}, +{0x00000160, 0x00000053}, {0x00000160, 0x0000030C}, {0x00000161, 0x00000073}, {0x00000161, 0x0000030C}, +{0x00000162, 0x00000054}, {0x00000162, 0x00000327}, {0x00000163, 0x00000074}, {0x00000163, 0x00000327}, +{0x00000164, 0x00000054}, {0x00000164, 0x0000030C}, {0x00000165, 0x00000074}, {0x00000165, 0x0000030C}, +{0x00000168, 0x00000055}, {0x00000168, 0x00000303}, {0x00000169, 0x00000075}, {0x00000169, 0x00000303}, +{0x0000016A, 0x00000055}, {0x0000016A, 0x00000304}, {0x0000016B, 0x00000075}, {0x0000016B, 0x00000304}, +{0x0000016C, 0x00000055}, {0x0000016C, 0x00000306}, {0x0000016D, 0x00000075}, {0x0000016D, 0x00000306}, +{0x0000016E, 0x00000055}, {0x0000016E, 0x0000030A}, {0x0000016F, 0x00000075}, {0x0000016F, 0x0000030A}, +{0x00000170, 0x00000055}, {0x00000170, 0x0000030B}, {0x00000171, 0x00000075}, {0x00000171, 0x0000030B}, +{0x00000172, 0x00000055}, {0x00000172, 0x00000328}, {0x00000173, 0x00000075}, {0x00000173, 0x00000328}, +{0x00000174, 0x00000057}, {0x00000174, 0x00000302}, {0x00000175, 0x00000077}, {0x00000175, 0x00000302}, +{0x00000176, 0x00000059}, {0x00000176, 0x00000302}, {0x00000177, 0x00000079}, {0x00000177, 0x00000302}, +{0x00000178, 0x00000059}, {0x00000178, 0x00000308}, {0x00000179, 0x0000005A}, {0x00000179, 0x00000301}, +{0x0000017A, 0x0000007A}, {0x0000017A, 0x00000301}, {0x0000017B, 0x0000005A}, {0x0000017B, 0x00000307}, +{0x0000017C, 0x0000007A}, {0x0000017C, 0x00000307}, {0x0000017D, 0x0000005A}, {0x0000017D, 0x0000030C}, +{0x0000017E, 0x0000007A}, {0x0000017E, 0x0000030C}, {0x000001A0, 0x0000004F}, {0x000001A0, 0x0000031B}, +{0x000001A1, 0x0000006F}, {0x000001A1, 0x0000031B}, {0x000001AF, 0x00000055}, {0x000001AF, 0x0000031B}, +{0x000001B0, 0x00000075}, {0x000001B0, 0x0000031B}, {0x000001CD, 0x00000041}, {0x000001CD, 0x0000030C}, +{0x000001CE, 0x00000061}, {0x000001CE, 0x0000030C}, {0x000001CF, 0x00000049}, {0x000001CF, 0x0000030C}, +{0x000001D0, 0x00000069}, {0x000001D0, 0x0000030C}, {0x000001D1, 0x0000004F}, {0x000001D1, 0x0000030C}, +{0x000001D2, 0x0000006F}, {0x000001D2, 0x0000030C}, {0x000001D3, 0x00000055}, {0x000001D3, 0x0000030C}, +{0x000001D4, 0x00000075}, {0x000001D4, 0x0000030C}, {0x000001D5, 0x00000055}, {0x000001D5, 0x00000308}, +{0x000001D5, 0x00000304}, {0x000001D6, 0x00000075}, {0x000001D6, 0x00000308}, {0x000001D6, 0x00000304}, +{0x000001D7, 0x00000055}, {0x000001D7, 0x00000308}, {0x000001D7, 0x00000301}, {0x000001D8, 0x00000075}, +{0x000001D8, 0x00000308}, {0x000001D8, 0x00000301}, {0x000001D9, 0x00000055}, {0x000001D9, 0x00000308}, +{0x000001D9, 0x0000030C}, {0x000001DA, 0x00000075}, {0x000001DA, 0x00000308}, {0x000001DA, 0x0000030C}, +{0x000001DB, 0x00000055}, {0x000001DB, 0x00000308}, {0x000001DB, 0x00000300}, {0x000001DC, 0x00000075}, +{0x000001DC, 0x00000308}, {0x000001DC, 0x00000300}, {0x000001DE, 0x00000041}, {0x000001DE, 0x00000308}, +{0x000001DE, 0x00000304}, {0x000001DF, 0x00000061}, {0x000001DF, 0x00000308}, {0x000001DF, 0x00000304}, +{0x000001E0, 0x00000041}, {0x000001E0, 0x00000307}, {0x000001E0, 0x00000304}, {0x000001E1, 0x00000061}, +{0x000001E1, 0x00000307}, {0x000001E1, 0x00000304}, {0x000001E2, 0x000000C6}, {0x000001E2, 0x00000304}, +{0x000001E3, 0x000000E6}, {0x000001E3, 0x00000304}, {0x000001E6, 0x00000047}, {0x000001E6, 0x0000030C}, +{0x000001E7, 0x00000067}, {0x000001E7, 0x0000030C}, {0x000001E8, 0x0000004B}, {0x000001E8, 0x0000030C}, +{0x000001E9, 0x0000006B}, {0x000001E9, 0x0000030C}, {0x000001EA, 0x0000004F}, {0x000001EA, 0x00000328}, +{0x000001EB, 0x0000006F}, {0x000001EB, 0x00000328}, {0x000001EC, 0x0000004F}, {0x000001EC, 0x00000328}, +{0x000001EC, 0x00000304}, {0x000001ED, 0x0000006F}, {0x000001ED, 0x00000328}, {0x000001ED, 0x00000304}, +{0x000001EE, 0x000001B7}, {0x000001EE, 0x0000030C}, {0x000001EF, 0x00000292}, {0x000001EF, 0x0000030C}, +{0x000001F0, 0x0000006A}, {0x000001F0, 0x0000030C}, {0x000001F4, 0x00000047}, {0x000001F4, 0x00000301}, +{0x000001F5, 0x00000067}, {0x000001F5, 0x00000301}, {0x000001F8, 0x0000004E}, {0x000001F8, 0x00000300}, +{0x000001F9, 0x0000006E}, {0x000001F9, 0x00000300}, {0x000001FA, 0x00000041}, {0x000001FA, 0x0000030A}, +{0x000001FA, 0x00000301}, {0x000001FB, 0x00000061}, {0x000001FB, 0x0000030A}, {0x000001FB, 0x00000301}, +{0x000001FC, 0x000000C6}, {0x000001FC, 0x00000301}, {0x000001FD, 0x000000E6}, {0x000001FD, 0x00000301}, +{0x000001FE, 0x000000D8}, {0x000001FE, 0x00000301}, {0x000001FF, 0x000000F8}, {0x000001FF, 0x00000301}, +{0x00000200, 0x00000041}, {0x00000200, 0x0000030F}, {0x00000201, 0x00000061}, {0x00000201, 0x0000030F}, +{0x00000202, 0x00000041}, {0x00000202, 0x00000311}, {0x00000203, 0x00000061}, {0x00000203, 0x00000311}, +{0x00000204, 0x00000045}, {0x00000204, 0x0000030F}, {0x00000205, 0x00000065}, {0x00000205, 0x0000030F}, +{0x00000206, 0x00000045}, {0x00000206, 0x00000311}, {0x00000207, 0x00000065}, {0x00000207, 0x00000311}, +{0x00000208, 0x00000049}, {0x00000208, 0x0000030F}, {0x00000209, 0x00000069}, {0x00000209, 0x0000030F}, +{0x0000020A, 0x00000049}, {0x0000020A, 0x00000311}, {0x0000020B, 0x00000069}, {0x0000020B, 0x00000311}, +{0x0000020C, 0x0000004F}, {0x0000020C, 0x0000030F}, {0x0000020D, 0x0000006F}, {0x0000020D, 0x0000030F}, +{0x0000020E, 0x0000004F}, {0x0000020E, 0x00000311}, {0x0000020F, 0x0000006F}, {0x0000020F, 0x00000311}, +{0x00000210, 0x00000052}, {0x00000210, 0x0000030F}, {0x00000211, 0x00000072}, {0x00000211, 0x0000030F}, +{0x00000212, 0x00000052}, {0x00000212, 0x00000311}, {0x00000213, 0x00000072}, {0x00000213, 0x00000311}, +{0x00000214, 0x00000055}, {0x00000214, 0x0000030F}, {0x00000215, 0x00000075}, {0x00000215, 0x0000030F}, +{0x00000216, 0x00000055}, {0x00000216, 0x00000311}, {0x00000217, 0x00000075}, {0x00000217, 0x00000311}, +{0x00000218, 0x00000053}, {0x00000218, 0x00000326}, {0x00000219, 0x00000073}, {0x00000219, 0x00000326}, +{0x0000021A, 0x00000054}, {0x0000021A, 0x00000326}, {0x0000021B, 0x00000074}, {0x0000021B, 0x00000326}, +{0x0000021E, 0x00000048}, {0x0000021E, 0x0000030C}, {0x0000021F, 0x00000068}, {0x0000021F, 0x0000030C}, +{0x00000226, 0x00000041}, {0x00000226, 0x00000307}, {0x00000227, 0x00000061}, {0x00000227, 0x00000307}, +{0x00000228, 0x00000045}, {0x00000228, 0x00000327}, {0x00000229, 0x00000065}, {0x00000229, 0x00000327}, +{0x0000022A, 0x0000004F}, {0x0000022A, 0x00000308}, {0x0000022A, 0x00000304}, {0x0000022B, 0x0000006F}, +{0x0000022B, 0x00000308}, {0x0000022B, 0x00000304}, {0x0000022C, 0x0000004F}, {0x0000022C, 0x00000303}, +{0x0000022C, 0x00000304}, {0x0000022D, 0x0000006F}, {0x0000022D, 0x00000303}, {0x0000022D, 0x00000304}, +{0x0000022E, 0x0000004F}, {0x0000022E, 0x00000307}, {0x0000022F, 0x0000006F}, {0x0000022F, 0x00000307}, +{0x00000230, 0x0000004F}, {0x00000230, 0x00000307}, {0x00000230, 0x00000304}, {0x00000231, 0x0000006F}, +{0x00000231, 0x00000307}, {0x00000231, 0x00000304}, {0x00000232, 0x00000059}, {0x00000232, 0x00000304}, +{0x00000233, 0x00000079}, {0x00000233, 0x00000304}, {0x00000340, 0x00000300}, {0x00000341, 0x00000301}, +{0x00000343, 0x00000313}, {0x00000344, 0x00000308}, {0x00000344, 0x00000301}, {0x00000374, 0x000002B9}, +{0x0000037E, 0x0000003B}, {0x00000385, 0x000000A8}, {0x00000385, 0x00000301}, {0x00000386, 0x00000391}, +{0x00000386, 0x00000301}, {0x00000387, 0x000000B7}, {0x00000388, 0x00000395}, {0x00000388, 0x00000301}, +{0x00000389, 0x00000397}, {0x00000389, 0x00000301}, {0x0000038A, 0x00000399}, {0x0000038A, 0x00000301}, +{0x0000038C, 0x0000039F}, {0x0000038C, 0x00000301}, {0x0000038E, 0x000003A5}, {0x0000038E, 0x00000301}, +{0x0000038F, 0x000003A9}, {0x0000038F, 0x00000301}, {0x00000390, 0x000003B9}, {0x00000390, 0x00000308}, +{0x00000390, 0x00000301}, {0x000003AA, 0x00000399}, {0x000003AA, 0x00000308}, {0x000003AB, 0x000003A5}, +{0x000003AB, 0x00000308}, {0x000003AC, 0x000003B1}, {0x000003AC, 0x00000301}, {0x000003AD, 0x000003B5}, +{0x000003AD, 0x00000301}, {0x000003AE, 0x000003B7}, {0x000003AE, 0x00000301}, {0x000003AF, 0x000003B9}, +{0x000003AF, 0x00000301}, {0x000003B0, 0x000003C5}, {0x000003B0, 0x00000308}, {0x000003B0, 0x00000301}, +{0x000003CA, 0x000003B9}, {0x000003CA, 0x00000308}, {0x000003CB, 0x000003C5}, {0x000003CB, 0x00000308}, +{0x000003CC, 0x000003BF}, {0x000003CC, 0x00000301}, {0x000003CD, 0x000003C5}, {0x000003CD, 0x00000301}, +{0x000003CE, 0x000003C9}, {0x000003CE, 0x00000301}, {0x000003D3, 0x000003D2}, {0x000003D3, 0x00000301}, +{0x000003D4, 0x000003D2}, {0x000003D4, 0x00000308}, {0x00000400, 0x00000415}, {0x00000400, 0x00000300}, +{0x00000401, 0x00000415}, {0x00000401, 0x00000308}, {0x00000403, 0x00000413}, {0x00000403, 0x00000301}, +{0x00000407, 0x00000406}, {0x00000407, 0x00000308}, {0x0000040C, 0x0000041A}, {0x0000040C, 0x00000301}, +{0x0000040D, 0x00000418}, {0x0000040D, 0x00000300}, {0x0000040E, 0x00000423}, {0x0000040E, 0x00000306}, +{0x00000419, 0x00000418}, {0x00000419, 0x00000306}, {0x00000439, 0x00000438}, {0x00000439, 0x00000306}, +{0x00000450, 0x00000435}, {0x00000450, 0x00000300}, {0x00000451, 0x00000435}, {0x00000451, 0x00000308}, +{0x00000453, 0x00000433}, {0x00000453, 0x00000301}, {0x00000457, 0x00000456}, {0x00000457, 0x00000308}, +{0x0000045C, 0x0000043A}, {0x0000045C, 0x00000301}, {0x0000045D, 0x00000438}, {0x0000045D, 0x00000300}, +{0x0000045E, 0x00000443}, {0x0000045E, 0x00000306}, {0x00000476, 0x00000474}, {0x00000476, 0x0000030F}, +{0x00000477, 0x00000475}, {0x00000477, 0x0000030F}, {0x000004C1, 0x00000416}, {0x000004C1, 0x00000306}, +{0x000004C2, 0x00000436}, {0x000004C2, 0x00000306}, {0x000004D0, 0x00000410}, {0x000004D0, 0x00000306}, +{0x000004D1, 0x00000430}, {0x000004D1, 0x00000306}, {0x000004D2, 0x00000410}, {0x000004D2, 0x00000308}, +{0x000004D3, 0x00000430}, {0x000004D3, 0x00000308}, {0x000004D6, 0x00000415}, {0x000004D6, 0x00000306}, +{0x000004D7, 0x00000435}, {0x000004D7, 0x00000306}, {0x000004DA, 0x000004D8}, {0x000004DA, 0x00000308}, +{0x000004DB, 0x000004D9}, {0x000004DB, 0x00000308}, {0x000004DC, 0x00000416}, {0x000004DC, 0x00000308}, +{0x000004DD, 0x00000436}, {0x000004DD, 0x00000308}, {0x000004DE, 0x00000417}, {0x000004DE, 0x00000308}, +{0x000004DF, 0x00000437}, {0x000004DF, 0x00000308}, {0x000004E2, 0x00000418}, {0x000004E2, 0x00000304}, +{0x000004E3, 0x00000438}, {0x000004E3, 0x00000304}, {0x000004E4, 0x00000418}, {0x000004E4, 0x00000308}, +{0x000004E5, 0x00000438}, {0x000004E5, 0x00000308}, {0x000004E6, 0x0000041E}, {0x000004E6, 0x00000308}, +{0x000004E7, 0x0000043E}, {0x000004E7, 0x00000308}, {0x000004EA, 0x000004E8}, {0x000004EA, 0x00000308}, +{0x000004EB, 0x000004E9}, {0x000004EB, 0x00000308}, {0x000004EC, 0x0000042D}, {0x000004EC, 0x00000308}, +{0x000004ED, 0x0000044D}, {0x000004ED, 0x00000308}, {0x000004EE, 0x00000423}, {0x000004EE, 0x00000304}, +{0x000004EF, 0x00000443}, {0x000004EF, 0x00000304}, {0x000004F0, 0x00000423}, {0x000004F0, 0x00000308}, +{0x000004F1, 0x00000443}, {0x000004F1, 0x00000308}, {0x000004F2, 0x00000423}, {0x000004F2, 0x0000030B}, +{0x000004F3, 0x00000443}, {0x000004F3, 0x0000030B}, {0x000004F4, 0x00000427}, {0x000004F4, 0x00000308}, +{0x000004F5, 0x00000447}, {0x000004F5, 0x00000308}, {0x000004F8, 0x0000042B}, {0x000004F8, 0x00000308}, +{0x000004F9, 0x0000044B}, {0x000004F9, 0x00000308}, {0x00000622, 0x00000627}, {0x00000622, 0x00000653}, +{0x00000623, 0x00000627}, {0x00000623, 0x00000654}, {0x00000624, 0x00000648}, {0x00000624, 0x00000654}, +{0x00000625, 0x00000627}, {0x00000625, 0x00000655}, {0x00000626, 0x0000064A}, {0x00000626, 0x00000654}, +{0x000006C0, 0x000006D5}, {0x000006C0, 0x00000654}, {0x000006C2, 0x000006C1}, {0x000006C2, 0x00000654}, +{0x000006D3, 0x000006D2}, {0x000006D3, 0x00000654}, {0x00000929, 0x00000928}, {0x00000929, 0x0000093C}, +{0x00000931, 0x00000930}, {0x00000931, 0x0000093C}, {0x00000934, 0x00000933}, {0x00000934, 0x0000093C}, +{0x00000958, 0x00000915}, {0x00000958, 0x0000093C}, {0x00000959, 0x00000916}, {0x00000959, 0x0000093C}, +{0x0000095A, 0x00000917}, {0x0000095A, 0x0000093C}, {0x0000095B, 0x0000091C}, {0x0000095B, 0x0000093C}, +{0x0000095C, 0x00000921}, {0x0000095C, 0x0000093C}, {0x0000095D, 0x00000922}, {0x0000095D, 0x0000093C}, +{0x0000095E, 0x0000092B}, {0x0000095E, 0x0000093C}, {0x0000095F, 0x0000092F}, {0x0000095F, 0x0000093C}, +{0x000009CB, 0x000009C7}, {0x000009CB, 0x000009BE}, {0x000009CC, 0x000009C7}, {0x000009CC, 0x000009D7}, +{0x000009DC, 0x000009A1}, {0x000009DC, 0x000009BC}, {0x000009DD, 0x000009A2}, {0x000009DD, 0x000009BC}, +{0x000009DF, 0x000009AF}, {0x000009DF, 0x000009BC}, {0x00000A33, 0x00000A32}, {0x00000A33, 0x00000A3C}, +{0x00000A36, 0x00000A38}, {0x00000A36, 0x00000A3C}, {0x00000A59, 0x00000A16}, {0x00000A59, 0x00000A3C}, +{0x00000A5A, 0x00000A17}, {0x00000A5A, 0x00000A3C}, {0x00000A5B, 0x00000A1C}, {0x00000A5B, 0x00000A3C}, +{0x00000A5E, 0x00000A2B}, {0x00000A5E, 0x00000A3C}, {0x00000B48, 0x00000B47}, {0x00000B48, 0x00000B56}, +{0x00000B4B, 0x00000B47}, {0x00000B4B, 0x00000B3E}, {0x00000B4C, 0x00000B47}, {0x00000B4C, 0x00000B57}, +{0x00000B5C, 0x00000B21}, {0x00000B5C, 0x00000B3C}, {0x00000B5D, 0x00000B22}, {0x00000B5D, 0x00000B3C}, +{0x00000B94, 0x00000B92}, {0x00000B94, 0x00000BD7}, {0x00000BCA, 0x00000BC6}, {0x00000BCA, 0x00000BBE}, +{0x00000BCB, 0x00000BC7}, {0x00000BCB, 0x00000BBE}, {0x00000BCC, 0x00000BC6}, {0x00000BCC, 0x00000BD7}, +{0x00000C48, 0x00000C46}, {0x00000C48, 0x00000C56}, {0x00000CC0, 0x00000CBF}, {0x00000CC0, 0x00000CD5}, +{0x00000CC7, 0x00000CC6}, {0x00000CC7, 0x00000CD5}, {0x00000CC8, 0x00000CC6}, {0x00000CC8, 0x00000CD6}, +{0x00000CCA, 0x00000CC6}, {0x00000CCA, 0x00000CC2}, {0x00000CCB, 0x00000CC6}, {0x00000CCB, 0x00000CC2}, +{0x00000CCB, 0x00000CD5}, {0x00000D4A, 0x00000D46}, {0x00000D4A, 0x00000D3E}, {0x00000D4B, 0x00000D47}, +{0x00000D4B, 0x00000D3E}, {0x00000D4C, 0x00000D46}, {0x00000D4C, 0x00000D57}, {0x00000DDA, 0x00000DD9}, +{0x00000DDA, 0x00000DCA}, {0x00000DDC, 0x00000DD9}, {0x00000DDC, 0x00000DCF}, {0x00000DDD, 0x00000DD9}, +{0x00000DDD, 0x00000DCF}, {0x00000DDD, 0x00000DCA}, {0x00000DDE, 0x00000DD9}, {0x00000DDE, 0x00000DDF}, +{0x00000F43, 0x00000F42}, {0x00000F43, 0x00000FB7}, {0x00000F4D, 0x00000F4C}, {0x00000F4D, 0x00000FB7}, +{0x00000F52, 0x00000F51}, {0x00000F52, 0x00000FB7}, {0x00000F57, 0x00000F56}, {0x00000F57, 0x00000FB7}, +{0x00000F5C, 0x00000F5B}, {0x00000F5C, 0x00000FB7}, {0x00000F69, 0x00000F40}, {0x00000F69, 0x00000FB5}, +{0x00000F73, 0x00000F71}, {0x00000F73, 0x00000F72}, {0x00000F75, 0x00000F71}, {0x00000F75, 0x00000F74}, +{0x00000F76, 0x00000FB2}, {0x00000F76, 0x00000F80}, {0x00000F78, 0x00000FB3}, {0x00000F78, 0x00000F80}, +{0x00000F81, 0x00000F71}, {0x00000F81, 0x00000F80}, {0x00000F93, 0x00000F92}, {0x00000F93, 0x00000FB7}, +{0x00000F9D, 0x00000F9C}, {0x00000F9D, 0x00000FB7}, {0x00000FA2, 0x00000FA1}, {0x00000FA2, 0x00000FB7}, +{0x00000FA7, 0x00000FA6}, {0x00000FA7, 0x00000FB7}, {0x00000FAC, 0x00000FAB}, {0x00000FAC, 0x00000FB7}, +{0x00000FB9, 0x00000F90}, {0x00000FB9, 0x00000FB5}, {0x00001026, 0x00001025}, {0x00001026, 0x0000102E}, +{0x00001B06, 0x00001B05}, {0x00001B06, 0x00001B35}, {0x00001B08, 0x00001B07}, {0x00001B08, 0x00001B35}, +{0x00001B0A, 0x00001B09}, {0x00001B0A, 0x00001B35}, {0x00001B0C, 0x00001B0B}, {0x00001B0C, 0x00001B35}, +{0x00001B0E, 0x00001B0D}, {0x00001B0E, 0x00001B35}, {0x00001B12, 0x00001B11}, {0x00001B12, 0x00001B35}, +{0x00001B3B, 0x00001B3A}, {0x00001B3B, 0x00001B35}, {0x00001B3D, 0x00001B3C}, {0x00001B3D, 0x00001B35}, +{0x00001B40, 0x00001B3E}, {0x00001B40, 0x00001B35}, {0x00001B41, 0x00001B3F}, {0x00001B41, 0x00001B35}, +{0x00001B43, 0x00001B42}, {0x00001B43, 0x00001B35}, {0x00001E00, 0x00000041}, {0x00001E00, 0x00000325}, +{0x00001E01, 0x00000061}, {0x00001E01, 0x00000325}, {0x00001E02, 0x00000042}, {0x00001E02, 0x00000307}, +{0x00001E03, 0x00000062}, {0x00001E03, 0x00000307}, {0x00001E04, 0x00000042}, {0x00001E04, 0x00000323}, +{0x00001E05, 0x00000062}, {0x00001E05, 0x00000323}, {0x00001E06, 0x00000042}, {0x00001E06, 0x00000331}, +{0x00001E07, 0x00000062}, {0x00001E07, 0x00000331}, {0x00001E08, 0x00000043}, {0x00001E08, 0x00000327}, +{0x00001E08, 0x00000301}, {0x00001E09, 0x00000063}, {0x00001E09, 0x00000327}, {0x00001E09, 0x00000301}, +{0x00001E0A, 0x00000044}, {0x00001E0A, 0x00000307}, {0x00001E0B, 0x00000064}, {0x00001E0B, 0x00000307}, +{0x00001E0C, 0x00000044}, {0x00001E0C, 0x00000323}, {0x00001E0D, 0x00000064}, {0x00001E0D, 0x00000323}, +{0x00001E0E, 0x00000044}, {0x00001E0E, 0x00000331}, {0x00001E0F, 0x00000064}, {0x00001E0F, 0x00000331}, +{0x00001E10, 0x00000044}, {0x00001E10, 0x00000327}, {0x00001E11, 0x00000064}, {0x00001E11, 0x00000327}, +{0x00001E12, 0x00000044}, {0x00001E12, 0x0000032D}, {0x00001E13, 0x00000064}, {0x00001E13, 0x0000032D}, +{0x00001E14, 0x00000045}, {0x00001E14, 0x00000304}, {0x00001E14, 0x00000300}, {0x00001E15, 0x00000065}, +{0x00001E15, 0x00000304}, {0x00001E15, 0x00000300}, {0x00001E16, 0x00000045}, {0x00001E16, 0x00000304}, +{0x00001E16, 0x00000301}, {0x00001E17, 0x00000065}, {0x00001E17, 0x00000304}, {0x00001E17, 0x00000301}, +{0x00001E18, 0x00000045}, {0x00001E18, 0x0000032D}, {0x00001E19, 0x00000065}, {0x00001E19, 0x0000032D}, +{0x00001E1A, 0x00000045}, {0x00001E1A, 0x00000330}, {0x00001E1B, 0x00000065}, {0x00001E1B, 0x00000330}, +{0x00001E1C, 0x00000045}, {0x00001E1C, 0x00000327}, {0x00001E1C, 0x00000306}, {0x00001E1D, 0x00000065}, +{0x00001E1D, 0x00000327}, {0x00001E1D, 0x00000306}, {0x00001E1E, 0x00000046}, {0x00001E1E, 0x00000307}, +{0x00001E1F, 0x00000066}, {0x00001E1F, 0x00000307}, {0x00001E20, 0x00000047}, {0x00001E20, 0x00000304}, +{0x00001E21, 0x00000067}, {0x00001E21, 0x00000304}, {0x00001E22, 0x00000048}, {0x00001E22, 0x00000307}, +{0x00001E23, 0x00000068}, {0x00001E23, 0x00000307}, {0x00001E24, 0x00000048}, {0x00001E24, 0x00000323}, +{0x00001E25, 0x00000068}, {0x00001E25, 0x00000323}, {0x00001E26, 0x00000048}, {0x00001E26, 0x00000308}, +{0x00001E27, 0x00000068}, {0x00001E27, 0x00000308}, {0x00001E28, 0x00000048}, {0x00001E28, 0x00000327}, +{0x00001E29, 0x00000068}, {0x00001E29, 0x00000327}, {0x00001E2A, 0x00000048}, {0x00001E2A, 0x0000032E}, +{0x00001E2B, 0x00000068}, {0x00001E2B, 0x0000032E}, {0x00001E2C, 0x00000049}, {0x00001E2C, 0x00000330}, +{0x00001E2D, 0x00000069}, {0x00001E2D, 0x00000330}, {0x00001E2E, 0x00000049}, {0x00001E2E, 0x00000308}, +{0x00001E2E, 0x00000301}, {0x00001E2F, 0x00000069}, {0x00001E2F, 0x00000308}, {0x00001E2F, 0x00000301}, +{0x00001E30, 0x0000004B}, {0x00001E30, 0x00000301}, {0x00001E31, 0x0000006B}, {0x00001E31, 0x00000301}, +{0x00001E32, 0x0000004B}, {0x00001E32, 0x00000323}, {0x00001E33, 0x0000006B}, {0x00001E33, 0x00000323}, +{0x00001E34, 0x0000004B}, {0x00001E34, 0x00000331}, {0x00001E35, 0x0000006B}, {0x00001E35, 0x00000331}, +{0x00001E36, 0x0000004C}, {0x00001E36, 0x00000323}, {0x00001E37, 0x0000006C}, {0x00001E37, 0x00000323}, +{0x00001E38, 0x0000004C}, {0x00001E38, 0x00000323}, {0x00001E38, 0x00000304}, {0x00001E39, 0x0000006C}, +{0x00001E39, 0x00000323}, {0x00001E39, 0x00000304}, {0x00001E3A, 0x0000004C}, {0x00001E3A, 0x00000331}, +{0x00001E3B, 0x0000006C}, {0x00001E3B, 0x00000331}, {0x00001E3C, 0x0000004C}, {0x00001E3C, 0x0000032D}, +{0x00001E3D, 0x0000006C}, {0x00001E3D, 0x0000032D}, {0x00001E3E, 0x0000004D}, {0x00001E3E, 0x00000301}, +{0x00001E3F, 0x0000006D}, {0x00001E3F, 0x00000301}, {0x00001E40, 0x0000004D}, {0x00001E40, 0x00000307}, +{0x00001E41, 0x0000006D}, {0x00001E41, 0x00000307}, {0x00001E42, 0x0000004D}, {0x00001E42, 0x00000323}, +{0x00001E43, 0x0000006D}, {0x00001E43, 0x00000323}, {0x00001E44, 0x0000004E}, {0x00001E44, 0x00000307}, +{0x00001E45, 0x0000006E}, {0x00001E45, 0x00000307}, {0x00001E46, 0x0000004E}, {0x00001E46, 0x00000323}, +{0x00001E47, 0x0000006E}, {0x00001E47, 0x00000323}, {0x00001E48, 0x0000004E}, {0x00001E48, 0x00000331}, +{0x00001E49, 0x0000006E}, {0x00001E49, 0x00000331}, {0x00001E4A, 0x0000004E}, {0x00001E4A, 0x0000032D}, +{0x00001E4B, 0x0000006E}, {0x00001E4B, 0x0000032D}, {0x00001E4C, 0x0000004F}, {0x00001E4C, 0x00000303}, +{0x00001E4C, 0x00000301}, {0x00001E4D, 0x0000006F}, {0x00001E4D, 0x00000303}, {0x00001E4D, 0x00000301}, +{0x00001E4E, 0x0000004F}, {0x00001E4E, 0x00000303}, {0x00001E4E, 0x00000308}, {0x00001E4F, 0x0000006F}, +{0x00001E4F, 0x00000303}, {0x00001E4F, 0x00000308}, {0x00001E50, 0x0000004F}, {0x00001E50, 0x00000304}, +{0x00001E50, 0x00000300}, {0x00001E51, 0x0000006F}, {0x00001E51, 0x00000304}, {0x00001E51, 0x00000300}, +{0x00001E52, 0x0000004F}, {0x00001E52, 0x00000304}, {0x00001E52, 0x00000301}, {0x00001E53, 0x0000006F}, +{0x00001E53, 0x00000304}, {0x00001E53, 0x00000301}, {0x00001E54, 0x00000050}, {0x00001E54, 0x00000301}, +{0x00001E55, 0x00000070}, {0x00001E55, 0x00000301}, {0x00001E56, 0x00000050}, {0x00001E56, 0x00000307}, +{0x00001E57, 0x00000070}, {0x00001E57, 0x00000307}, {0x00001E58, 0x00000052}, {0x00001E58, 0x00000307}, +{0x00001E59, 0x00000072}, {0x00001E59, 0x00000307}, {0x00001E5A, 0x00000052}, {0x00001E5A, 0x00000323}, +{0x00001E5B, 0x00000072}, {0x00001E5B, 0x00000323}, {0x00001E5C, 0x00000052}, {0x00001E5C, 0x00000323}, +{0x00001E5C, 0x00000304}, {0x00001E5D, 0x00000072}, {0x00001E5D, 0x00000323}, {0x00001E5D, 0x00000304}, +{0x00001E5E, 0x00000052}, {0x00001E5E, 0x00000331}, {0x00001E5F, 0x00000072}, {0x00001E5F, 0x00000331}, +{0x00001E60, 0x00000053}, {0x00001E60, 0x00000307}, {0x00001E61, 0x00000073}, {0x00001E61, 0x00000307}, +{0x00001E62, 0x00000053}, {0x00001E62, 0x00000323}, {0x00001E63, 0x00000073}, {0x00001E63, 0x00000323}, +{0x00001E64, 0x00000053}, {0x00001E64, 0x00000301}, {0x00001E64, 0x00000307}, {0x00001E65, 0x00000073}, +{0x00001E65, 0x00000301}, {0x00001E65, 0x00000307}, {0x00001E66, 0x00000053}, {0x00001E66, 0x0000030C}, +{0x00001E66, 0x00000307}, {0x00001E67, 0x00000073}, {0x00001E67, 0x0000030C}, {0x00001E67, 0x00000307}, +{0x00001E68, 0x00000053}, {0x00001E68, 0x00000323}, {0x00001E68, 0x00000307}, {0x00001E69, 0x00000073}, +{0x00001E69, 0x00000323}, {0x00001E69, 0x00000307}, {0x00001E6A, 0x00000054}, {0x00001E6A, 0x00000307}, +{0x00001E6B, 0x00000074}, {0x00001E6B, 0x00000307}, {0x00001E6C, 0x00000054}, {0x00001E6C, 0x00000323}, +{0x00001E6D, 0x00000074}, {0x00001E6D, 0x00000323}, {0x00001E6E, 0x00000054}, {0x00001E6E, 0x00000331}, +{0x00001E6F, 0x00000074}, {0x00001E6F, 0x00000331}, {0x00001E70, 0x00000054}, {0x00001E70, 0x0000032D}, +{0x00001E71, 0x00000074}, {0x00001E71, 0x0000032D}, {0x00001E72, 0x00000055}, {0x00001E72, 0x00000324}, +{0x00001E73, 0x00000075}, {0x00001E73, 0x00000324}, {0x00001E74, 0x00000055}, {0x00001E74, 0x00000330}, +{0x00001E75, 0x00000075}, {0x00001E75, 0x00000330}, {0x00001E76, 0x00000055}, {0x00001E76, 0x0000032D}, +{0x00001E77, 0x00000075}, {0x00001E77, 0x0000032D}, {0x00001E78, 0x00000055}, {0x00001E78, 0x00000303}, +{0x00001E78, 0x00000301}, {0x00001E79, 0x00000075}, {0x00001E79, 0x00000303}, {0x00001E79, 0x00000301}, +{0x00001E7A, 0x00000055}, {0x00001E7A, 0x00000304}, {0x00001E7A, 0x00000308}, {0x00001E7B, 0x00000075}, +{0x00001E7B, 0x00000304}, {0x00001E7B, 0x00000308}, {0x00001E7C, 0x00000056}, {0x00001E7C, 0x00000303}, +{0x00001E7D, 0x00000076}, {0x00001E7D, 0x00000303}, {0x00001E7E, 0x00000056}, {0x00001E7E, 0x00000323}, +{0x00001E7F, 0x00000076}, {0x00001E7F, 0x00000323}, {0x00001E80, 0x00000057}, {0x00001E80, 0x00000300}, +{0x00001E81, 0x00000077}, {0x00001E81, 0x00000300}, {0x00001E82, 0x00000057}, {0x00001E82, 0x00000301}, +{0x00001E83, 0x00000077}, {0x00001E83, 0x00000301}, {0x00001E84, 0x00000057}, {0x00001E84, 0x00000308}, +{0x00001E85, 0x00000077}, {0x00001E85, 0x00000308}, {0x00001E86, 0x00000057}, {0x00001E86, 0x00000307}, +{0x00001E87, 0x00000077}, {0x00001E87, 0x00000307}, {0x00001E88, 0x00000057}, {0x00001E88, 0x00000323}, +{0x00001E89, 0x00000077}, {0x00001E89, 0x00000323}, {0x00001E8A, 0x00000058}, {0x00001E8A, 0x00000307}, +{0x00001E8B, 0x00000078}, {0x00001E8B, 0x00000307}, {0x00001E8C, 0x00000058}, {0x00001E8C, 0x00000308}, +{0x00001E8D, 0x00000078}, {0x00001E8D, 0x00000308}, {0x00001E8E, 0x00000059}, {0x00001E8E, 0x00000307}, +{0x00001E8F, 0x00000079}, {0x00001E8F, 0x00000307}, {0x00001E90, 0x0000005A}, {0x00001E90, 0x00000302}, +{0x00001E91, 0x0000007A}, {0x00001E91, 0x00000302}, {0x00001E92, 0x0000005A}, {0x00001E92, 0x00000323}, +{0x00001E93, 0x0000007A}, {0x00001E93, 0x00000323}, {0x00001E94, 0x0000005A}, {0x00001E94, 0x00000331}, +{0x00001E95, 0x0000007A}, {0x00001E95, 0x00000331}, {0x00001E96, 0x00000068}, {0x00001E96, 0x00000331}, +{0x00001E97, 0x00000074}, {0x00001E97, 0x00000308}, {0x00001E98, 0x00000077}, {0x00001E98, 0x0000030A}, +{0x00001E99, 0x00000079}, {0x00001E99, 0x0000030A}, {0x00001E9B, 0x0000017F}, {0x00001E9B, 0x00000307}, +{0x00001EA0, 0x00000041}, {0x00001EA0, 0x00000323}, {0x00001EA1, 0x00000061}, {0x00001EA1, 0x00000323}, +{0x00001EA2, 0x00000041}, {0x00001EA2, 0x00000309}, {0x00001EA3, 0x00000061}, {0x00001EA3, 0x00000309}, +{0x00001EA4, 0x00000041}, {0x00001EA4, 0x00000302}, {0x00001EA4, 0x00000301}, {0x00001EA5, 0x00000061}, +{0x00001EA5, 0x00000302}, {0x00001EA5, 0x00000301}, {0x00001EA6, 0x00000041}, {0x00001EA6, 0x00000302}, +{0x00001EA6, 0x00000300}, {0x00001EA7, 0x00000061}, {0x00001EA7, 0x00000302}, {0x00001EA7, 0x00000300}, +{0x00001EA8, 0x00000041}, {0x00001EA8, 0x00000302}, {0x00001EA8, 0x00000309}, {0x00001EA9, 0x00000061}, +{0x00001EA9, 0x00000302}, {0x00001EA9, 0x00000309}, {0x00001EAA, 0x00000041}, {0x00001EAA, 0x00000302}, +{0x00001EAA, 0x00000303}, {0x00001EAB, 0x00000061}, {0x00001EAB, 0x00000302}, {0x00001EAB, 0x00000303}, +{0x00001EAC, 0x00000041}, {0x00001EAC, 0x00000323}, {0x00001EAC, 0x00000302}, {0x00001EAD, 0x00000061}, +{0x00001EAD, 0x00000323}, {0x00001EAD, 0x00000302}, {0x00001EAE, 0x00000041}, {0x00001EAE, 0x00000306}, +{0x00001EAE, 0x00000301}, {0x00001EAF, 0x00000061}, {0x00001EAF, 0x00000306}, {0x00001EAF, 0x00000301}, +{0x00001EB0, 0x00000041}, {0x00001EB0, 0x00000306}, {0x00001EB0, 0x00000300}, {0x00001EB1, 0x00000061}, +{0x00001EB1, 0x00000306}, {0x00001EB1, 0x00000300}, {0x00001EB2, 0x00000041}, {0x00001EB2, 0x00000306}, +{0x00001EB2, 0x00000309}, {0x00001EB3, 0x00000061}, {0x00001EB3, 0x00000306}, {0x00001EB3, 0x00000309}, +{0x00001EB4, 0x00000041}, {0x00001EB4, 0x00000306}, {0x00001EB4, 0x00000303}, {0x00001EB5, 0x00000061}, +{0x00001EB5, 0x00000306}, {0x00001EB5, 0x00000303}, {0x00001EB6, 0x00000041}, {0x00001EB6, 0x00000323}, +{0x00001EB6, 0x00000306}, {0x00001EB7, 0x00000061}, {0x00001EB7, 0x00000323}, {0x00001EB7, 0x00000306}, +{0x00001EB8, 0x00000045}, {0x00001EB8, 0x00000323}, {0x00001EB9, 0x00000065}, {0x00001EB9, 0x00000323}, +{0x00001EBA, 0x00000045}, {0x00001EBA, 0x00000309}, {0x00001EBB, 0x00000065}, {0x00001EBB, 0x00000309}, +{0x00001EBC, 0x00000045}, {0x00001EBC, 0x00000303}, {0x00001EBD, 0x00000065}, {0x00001EBD, 0x00000303}, +{0x00001EBE, 0x00000045}, {0x00001EBE, 0x00000302}, {0x00001EBE, 0x00000301}, {0x00001EBF, 0x00000065}, +{0x00001EBF, 0x00000302}, {0x00001EBF, 0x00000301}, {0x00001EC0, 0x00000045}, {0x00001EC0, 0x00000302}, +{0x00001EC0, 0x00000300}, {0x00001EC1, 0x00000065}, {0x00001EC1, 0x00000302}, {0x00001EC1, 0x00000300}, +{0x00001EC2, 0x00000045}, {0x00001EC2, 0x00000302}, {0x00001EC2, 0x00000309}, {0x00001EC3, 0x00000065}, +{0x00001EC3, 0x00000302}, {0x00001EC3, 0x00000309}, {0x00001EC4, 0x00000045}, {0x00001EC4, 0x00000302}, +{0x00001EC4, 0x00000303}, {0x00001EC5, 0x00000065}, {0x00001EC5, 0x00000302}, {0x00001EC5, 0x00000303}, +{0x00001EC6, 0x00000045}, {0x00001EC6, 0x00000323}, {0x00001EC6, 0x00000302}, {0x00001EC7, 0x00000065}, +{0x00001EC7, 0x00000323}, {0x00001EC7, 0x00000302}, {0x00001EC8, 0x00000049}, {0x00001EC8, 0x00000309}, +{0x00001EC9, 0x00000069}, {0x00001EC9, 0x00000309}, {0x00001ECA, 0x00000049}, {0x00001ECA, 0x00000323}, +{0x00001ECB, 0x00000069}, {0x00001ECB, 0x00000323}, {0x00001ECC, 0x0000004F}, {0x00001ECC, 0x00000323}, +{0x00001ECD, 0x0000006F}, {0x00001ECD, 0x00000323}, {0x00001ECE, 0x0000004F}, {0x00001ECE, 0x00000309}, +{0x00001ECF, 0x0000006F}, {0x00001ECF, 0x00000309}, {0x00001ED0, 0x0000004F}, {0x00001ED0, 0x00000302}, +{0x00001ED0, 0x00000301}, {0x00001ED1, 0x0000006F}, {0x00001ED1, 0x00000302}, {0x00001ED1, 0x00000301}, +{0x00001ED2, 0x0000004F}, {0x00001ED2, 0x00000302}, {0x00001ED2, 0x00000300}, {0x00001ED3, 0x0000006F}, +{0x00001ED3, 0x00000302}, {0x00001ED3, 0x00000300}, {0x00001ED4, 0x0000004F}, {0x00001ED4, 0x00000302}, +{0x00001ED4, 0x00000309}, {0x00001ED5, 0x0000006F}, {0x00001ED5, 0x00000302}, {0x00001ED5, 0x00000309}, +{0x00001ED6, 0x0000004F}, {0x00001ED6, 0x00000302}, {0x00001ED6, 0x00000303}, {0x00001ED7, 0x0000006F}, +{0x00001ED7, 0x00000302}, {0x00001ED7, 0x00000303}, {0x00001ED8, 0x0000004F}, {0x00001ED8, 0x00000323}, +{0x00001ED8, 0x00000302}, {0x00001ED9, 0x0000006F}, {0x00001ED9, 0x00000323}, {0x00001ED9, 0x00000302}, +{0x00001EDA, 0x0000004F}, {0x00001EDA, 0x0000031B}, {0x00001EDA, 0x00000301}, {0x00001EDB, 0x0000006F}, +{0x00001EDB, 0x0000031B}, {0x00001EDB, 0x00000301}, {0x00001EDC, 0x0000004F}, {0x00001EDC, 0x0000031B}, +{0x00001EDC, 0x00000300}, {0x00001EDD, 0x0000006F}, {0x00001EDD, 0x0000031B}, {0x00001EDD, 0x00000300}, +{0x00001EDE, 0x0000004F}, {0x00001EDE, 0x0000031B}, {0x00001EDE, 0x00000309}, {0x00001EDF, 0x0000006F}, +{0x00001EDF, 0x0000031B}, {0x00001EDF, 0x00000309}, {0x00001EE0, 0x0000004F}, {0x00001EE0, 0x0000031B}, +{0x00001EE0, 0x00000303}, {0x00001EE1, 0x0000006F}, {0x00001EE1, 0x0000031B}, {0x00001EE1, 0x00000303}, +{0x00001EE2, 0x0000004F}, {0x00001EE2, 0x0000031B}, {0x00001EE2, 0x00000323}, {0x00001EE3, 0x0000006F}, +{0x00001EE3, 0x0000031B}, {0x00001EE3, 0x00000323}, {0x00001EE4, 0x00000055}, {0x00001EE4, 0x00000323}, +{0x00001EE5, 0x00000075}, {0x00001EE5, 0x00000323}, {0x00001EE6, 0x00000055}, {0x00001EE6, 0x00000309}, +{0x00001EE7, 0x00000075}, {0x00001EE7, 0x00000309}, {0x00001EE8, 0x00000055}, {0x00001EE8, 0x0000031B}, +{0x00001EE8, 0x00000301}, {0x00001EE9, 0x00000075}, {0x00001EE9, 0x0000031B}, {0x00001EE9, 0x00000301}, +{0x00001EEA, 0x00000055}, {0x00001EEA, 0x0000031B}, {0x00001EEA, 0x00000300}, {0x00001EEB, 0x00000075}, +{0x00001EEB, 0x0000031B}, {0x00001EEB, 0x00000300}, {0x00001EEC, 0x00000055}, {0x00001EEC, 0x0000031B}, +{0x00001EEC, 0x00000309}, {0x00001EED, 0x00000075}, {0x00001EED, 0x0000031B}, {0x00001EED, 0x00000309}, +{0x00001EEE, 0x00000055}, {0x00001EEE, 0x0000031B}, {0x00001EEE, 0x00000303}, {0x00001EEF, 0x00000075}, +{0x00001EEF, 0x0000031B}, {0x00001EEF, 0x00000303}, {0x00001EF0, 0x00000055}, {0x00001EF0, 0x0000031B}, +{0x00001EF0, 0x00000323}, {0x00001EF1, 0x00000075}, {0x00001EF1, 0x0000031B}, {0x00001EF1, 0x00000323}, +{0x00001EF2, 0x00000059}, {0x00001EF2, 0x00000300}, {0x00001EF3, 0x00000079}, {0x00001EF3, 0x00000300}, +{0x00001EF4, 0x00000059}, {0x00001EF4, 0x00000323}, {0x00001EF5, 0x00000079}, {0x00001EF5, 0x00000323}, +{0x00001EF6, 0x00000059}, {0x00001EF6, 0x00000309}, {0x00001EF7, 0x00000079}, {0x00001EF7, 0x00000309}, +{0x00001EF8, 0x00000059}, {0x00001EF8, 0x00000303}, {0x00001EF9, 0x00000079}, {0x00001EF9, 0x00000303}, +{0x00001F00, 0x000003B1}, {0x00001F00, 0x00000313}, {0x00001F01, 0x000003B1}, {0x00001F01, 0x00000314}, +{0x00001F02, 0x000003B1}, {0x00001F02, 0x00000313}, {0x00001F02, 0x00000300}, {0x00001F03, 0x000003B1}, +{0x00001F03, 0x00000314}, {0x00001F03, 0x00000300}, {0x00001F04, 0x000003B1}, {0x00001F04, 0x00000313}, +{0x00001F04, 0x00000301}, {0x00001F05, 0x000003B1}, {0x00001F05, 0x00000314}, {0x00001F05, 0x00000301}, +{0x00001F06, 0x000003B1}, {0x00001F06, 0x00000313}, {0x00001F06, 0x00000342}, {0x00001F07, 0x000003B1}, +{0x00001F07, 0x00000314}, {0x00001F07, 0x00000342}, {0x00001F08, 0x00000391}, {0x00001F08, 0x00000313}, +{0x00001F09, 0x00000391}, {0x00001F09, 0x00000314}, {0x00001F0A, 0x00000391}, {0x00001F0A, 0x00000313}, +{0x00001F0A, 0x00000300}, {0x00001F0B, 0x00000391}, {0x00001F0B, 0x00000314}, {0x00001F0B, 0x00000300}, +{0x00001F0C, 0x00000391}, {0x00001F0C, 0x00000313}, {0x00001F0C, 0x00000301}, {0x00001F0D, 0x00000391}, +{0x00001F0D, 0x00000314}, {0x00001F0D, 0x00000301}, {0x00001F0E, 0x00000391}, {0x00001F0E, 0x00000313}, +{0x00001F0E, 0x00000342}, {0x00001F0F, 0x00000391}, {0x00001F0F, 0x00000314}, {0x00001F0F, 0x00000342}, +{0x00001F10, 0x000003B5}, {0x00001F10, 0x00000313}, {0x00001F11, 0x000003B5}, {0x00001F11, 0x00000314}, +{0x00001F12, 0x000003B5}, {0x00001F12, 0x00000313}, {0x00001F12, 0x00000300}, {0x00001F13, 0x000003B5}, +{0x00001F13, 0x00000314}, {0x00001F13, 0x00000300}, {0x00001F14, 0x000003B5}, {0x00001F14, 0x00000313}, +{0x00001F14, 0x00000301}, {0x00001F15, 0x000003B5}, {0x00001F15, 0x00000314}, {0x00001F15, 0x00000301}, +{0x00001F18, 0x00000395}, {0x00001F18, 0x00000313}, {0x00001F19, 0x00000395}, {0x00001F19, 0x00000314}, +{0x00001F1A, 0x00000395}, {0x00001F1A, 0x00000313}, {0x00001F1A, 0x00000300}, {0x00001F1B, 0x00000395}, +{0x00001F1B, 0x00000314}, {0x00001F1B, 0x00000300}, {0x00001F1C, 0x00000395}, {0x00001F1C, 0x00000313}, +{0x00001F1C, 0x00000301}, {0x00001F1D, 0x00000395}, {0x00001F1D, 0x00000314}, {0x00001F1D, 0x00000301}, +{0x00001F20, 0x000003B7}, {0x00001F20, 0x00000313}, {0x00001F21, 0x000003B7}, {0x00001F21, 0x00000314}, +{0x00001F22, 0x000003B7}, {0x00001F22, 0x00000313}, {0x00001F22, 0x00000300}, {0x00001F23, 0x000003B7}, +{0x00001F23, 0x00000314}, {0x00001F23, 0x00000300}, {0x00001F24, 0x000003B7}, {0x00001F24, 0x00000313}, +{0x00001F24, 0x00000301}, {0x00001F25, 0x000003B7}, {0x00001F25, 0x00000314}, {0x00001F25, 0x00000301}, +{0x00001F26, 0x000003B7}, {0x00001F26, 0x00000313}, {0x00001F26, 0x00000342}, {0x00001F27, 0x000003B7}, +{0x00001F27, 0x00000314}, {0x00001F27, 0x00000342}, {0x00001F28, 0x00000397}, {0x00001F28, 0x00000313}, +{0x00001F29, 0x00000397}, {0x00001F29, 0x00000314}, {0x00001F2A, 0x00000397}, {0x00001F2A, 0x00000313}, +{0x00001F2A, 0x00000300}, {0x00001F2B, 0x00000397}, {0x00001F2B, 0x00000314}, {0x00001F2B, 0x00000300}, +{0x00001F2C, 0x00000397}, {0x00001F2C, 0x00000313}, {0x00001F2C, 0x00000301}, {0x00001F2D, 0x00000397}, +{0x00001F2D, 0x00000314}, {0x00001F2D, 0x00000301}, {0x00001F2E, 0x00000397}, {0x00001F2E, 0x00000313}, +{0x00001F2E, 0x00000342}, {0x00001F2F, 0x00000397}, {0x00001F2F, 0x00000314}, {0x00001F2F, 0x00000342}, +{0x00001F30, 0x000003B9}, {0x00001F30, 0x00000313}, {0x00001F31, 0x000003B9}, {0x00001F31, 0x00000314}, +{0x00001F32, 0x000003B9}, {0x00001F32, 0x00000313}, {0x00001F32, 0x00000300}, {0x00001F33, 0x000003B9}, +{0x00001F33, 0x00000314}, {0x00001F33, 0x00000300}, {0x00001F34, 0x000003B9}, {0x00001F34, 0x00000313}, +{0x00001F34, 0x00000301}, {0x00001F35, 0x000003B9}, {0x00001F35, 0x00000314}, {0x00001F35, 0x00000301}, +{0x00001F36, 0x000003B9}, {0x00001F36, 0x00000313}, {0x00001F36, 0x00000342}, {0x00001F37, 0x000003B9}, +{0x00001F37, 0x00000314}, {0x00001F37, 0x00000342}, {0x00001F38, 0x00000399}, {0x00001F38, 0x00000313}, +{0x00001F39, 0x00000399}, {0x00001F39, 0x00000314}, {0x00001F3A, 0x00000399}, {0x00001F3A, 0x00000313}, +{0x00001F3A, 0x00000300}, {0x00001F3B, 0x00000399}, {0x00001F3B, 0x00000314}, {0x00001F3B, 0x00000300}, +{0x00001F3C, 0x00000399}, {0x00001F3C, 0x00000313}, {0x00001F3C, 0x00000301}, {0x00001F3D, 0x00000399}, +{0x00001F3D, 0x00000314}, {0x00001F3D, 0x00000301}, {0x00001F3E, 0x00000399}, {0x00001F3E, 0x00000313}, +{0x00001F3E, 0x00000342}, {0x00001F3F, 0x00000399}, {0x00001F3F, 0x00000314}, {0x00001F3F, 0x00000342}, +{0x00001F40, 0x000003BF}, {0x00001F40, 0x00000313}, {0x00001F41, 0x000003BF}, {0x00001F41, 0x00000314}, +{0x00001F42, 0x000003BF}, {0x00001F42, 0x00000313}, {0x00001F42, 0x00000300}, {0x00001F43, 0x000003BF}, +{0x00001F43, 0x00000314}, {0x00001F43, 0x00000300}, {0x00001F44, 0x000003BF}, {0x00001F44, 0x00000313}, +{0x00001F44, 0x00000301}, {0x00001F45, 0x000003BF}, {0x00001F45, 0x00000314}, {0x00001F45, 0x00000301}, +{0x00001F48, 0x0000039F}, {0x00001F48, 0x00000313}, {0x00001F49, 0x0000039F}, {0x00001F49, 0x00000314}, +{0x00001F4A, 0x0000039F}, {0x00001F4A, 0x00000313}, {0x00001F4A, 0x00000300}, {0x00001F4B, 0x0000039F}, +{0x00001F4B, 0x00000314}, {0x00001F4B, 0x00000300}, {0x00001F4C, 0x0000039F}, {0x00001F4C, 0x00000313}, +{0x00001F4C, 0x00000301}, {0x00001F4D, 0x0000039F}, {0x00001F4D, 0x00000314}, {0x00001F4D, 0x00000301}, +{0x00001F50, 0x000003C5}, {0x00001F50, 0x00000313}, {0x00001F51, 0x000003C5}, {0x00001F51, 0x00000314}, +{0x00001F52, 0x000003C5}, {0x00001F52, 0x00000313}, {0x00001F52, 0x00000300}, {0x00001F53, 0x000003C5}, +{0x00001F53, 0x00000314}, {0x00001F53, 0x00000300}, {0x00001F54, 0x000003C5}, {0x00001F54, 0x00000313}, +{0x00001F54, 0x00000301}, {0x00001F55, 0x000003C5}, {0x00001F55, 0x00000314}, {0x00001F55, 0x00000301}, +{0x00001F56, 0x000003C5}, {0x00001F56, 0x00000313}, {0x00001F56, 0x00000342}, {0x00001F57, 0x000003C5}, +{0x00001F57, 0x00000314}, {0x00001F57, 0x00000342}, {0x00001F59, 0x000003A5}, {0x00001F59, 0x00000314}, +{0x00001F5B, 0x000003A5}, {0x00001F5B, 0x00000314}, {0x00001F5B, 0x00000300}, {0x00001F5D, 0x000003A5}, +{0x00001F5D, 0x00000314}, {0x00001F5D, 0x00000301}, {0x00001F5F, 0x000003A5}, {0x00001F5F, 0x00000314}, +{0x00001F5F, 0x00000342}, {0x00001F60, 0x000003C9}, {0x00001F60, 0x00000313}, {0x00001F61, 0x000003C9}, +{0x00001F61, 0x00000314}, {0x00001F62, 0x000003C9}, {0x00001F62, 0x00000313}, {0x00001F62, 0x00000300}, +{0x00001F63, 0x000003C9}, {0x00001F63, 0x00000314}, {0x00001F63, 0x00000300}, {0x00001F64, 0x000003C9}, +{0x00001F64, 0x00000313}, {0x00001F64, 0x00000301}, {0x00001F65, 0x000003C9}, {0x00001F65, 0x00000314}, +{0x00001F65, 0x00000301}, {0x00001F66, 0x000003C9}, {0x00001F66, 0x00000313}, {0x00001F66, 0x00000342}, +{0x00001F67, 0x000003C9}, {0x00001F67, 0x00000314}, {0x00001F67, 0x00000342}, {0x00001F68, 0x000003A9}, +{0x00001F68, 0x00000313}, {0x00001F69, 0x000003A9}, {0x00001F69, 0x00000314}, {0x00001F6A, 0x000003A9}, +{0x00001F6A, 0x00000313}, {0x00001F6A, 0x00000300}, {0x00001F6B, 0x000003A9}, {0x00001F6B, 0x00000314}, +{0x00001F6B, 0x00000300}, {0x00001F6C, 0x000003A9}, {0x00001F6C, 0x00000313}, {0x00001F6C, 0x00000301}, +{0x00001F6D, 0x000003A9}, {0x00001F6D, 0x00000314}, {0x00001F6D, 0x00000301}, {0x00001F6E, 0x000003A9}, +{0x00001F6E, 0x00000313}, {0x00001F6E, 0x00000342}, {0x00001F6F, 0x000003A9}, {0x00001F6F, 0x00000314}, +{0x00001F6F, 0x00000342}, {0x00001F70, 0x000003B1}, {0x00001F70, 0x00000300}, {0x00001F71, 0x000003B1}, +{0x00001F71, 0x00000301}, {0x00001F72, 0x000003B5}, {0x00001F72, 0x00000300}, {0x00001F73, 0x000003B5}, +{0x00001F73, 0x00000301}, {0x00001F74, 0x000003B7}, {0x00001F74, 0x00000300}, {0x00001F75, 0x000003B7}, +{0x00001F75, 0x00000301}, {0x00001F76, 0x000003B9}, {0x00001F76, 0x00000300}, {0x00001F77, 0x000003B9}, +{0x00001F77, 0x00000301}, {0x00001F78, 0x000003BF}, {0x00001F78, 0x00000300}, {0x00001F79, 0x000003BF}, +{0x00001F79, 0x00000301}, {0x00001F7A, 0x000003C5}, {0x00001F7A, 0x00000300}, {0x00001F7B, 0x000003C5}, +{0x00001F7B, 0x00000301}, {0x00001F7C, 0x000003C9}, {0x00001F7C, 0x00000300}, {0x00001F7D, 0x000003C9}, +{0x00001F7D, 0x00000301}, {0x00001F80, 0x000003B1}, {0x00001F80, 0x00000313}, {0x00001F80, 0x00000345}, +{0x00001F81, 0x000003B1}, {0x00001F81, 0x00000314}, {0x00001F81, 0x00000345}, {0x00001F82, 0x000003B1}, +{0x00001F82, 0x00000313}, {0x00001F82, 0x00000300}, {0x00001F82, 0x00000345}, {0x00001F83, 0x000003B1}, +{0x00001F83, 0x00000314}, {0x00001F83, 0x00000300}, {0x00001F83, 0x00000345}, {0x00001F84, 0x000003B1}, +{0x00001F84, 0x00000313}, {0x00001F84, 0x00000301}, {0x00001F84, 0x00000345}, {0x00001F85, 0x000003B1}, +{0x00001F85, 0x00000314}, {0x00001F85, 0x00000301}, {0x00001F85, 0x00000345}, {0x00001F86, 0x000003B1}, +{0x00001F86, 0x00000313}, {0x00001F86, 0x00000342}, {0x00001F86, 0x00000345}, {0x00001F87, 0x000003B1}, +{0x00001F87, 0x00000314}, {0x00001F87, 0x00000342}, {0x00001F87, 0x00000345}, {0x00001F88, 0x00000391}, +{0x00001F88, 0x00000313}, {0x00001F88, 0x00000345}, {0x00001F89, 0x00000391}, {0x00001F89, 0x00000314}, +{0x00001F89, 0x00000345}, {0x00001F8A, 0x00000391}, {0x00001F8A, 0x00000313}, {0x00001F8A, 0x00000300}, +{0x00001F8A, 0x00000345}, {0x00001F8B, 0x00000391}, {0x00001F8B, 0x00000314}, {0x00001F8B, 0x00000300}, +{0x00001F8B, 0x00000345}, {0x00001F8C, 0x00000391}, {0x00001F8C, 0x00000313}, {0x00001F8C, 0x00000301}, +{0x00001F8C, 0x00000345}, {0x00001F8D, 0x00000391}, {0x00001F8D, 0x00000314}, {0x00001F8D, 0x00000301}, +{0x00001F8D, 0x00000345}, {0x00001F8E, 0x00000391}, {0x00001F8E, 0x00000313}, {0x00001F8E, 0x00000342}, +{0x00001F8E, 0x00000345}, {0x00001F8F, 0x00000391}, {0x00001F8F, 0x00000314}, {0x00001F8F, 0x00000342}, +{0x00001F8F, 0x00000345}, {0x00001F90, 0x000003B7}, {0x00001F90, 0x00000313}, {0x00001F90, 0x00000345}, +{0x00001F91, 0x000003B7}, {0x00001F91, 0x00000314}, {0x00001F91, 0x00000345}, {0x00001F92, 0x000003B7}, +{0x00001F92, 0x00000313}, {0x00001F92, 0x00000300}, {0x00001F92, 0x00000345}, {0x00001F93, 0x000003B7}, +{0x00001F93, 0x00000314}, {0x00001F93, 0x00000300}, {0x00001F93, 0x00000345}, {0x00001F94, 0x000003B7}, +{0x00001F94, 0x00000313}, {0x00001F94, 0x00000301}, {0x00001F94, 0x00000345}, {0x00001F95, 0x000003B7}, +{0x00001F95, 0x00000314}, {0x00001F95, 0x00000301}, {0x00001F95, 0x00000345}, {0x00001F96, 0x000003B7}, +{0x00001F96, 0x00000313}, {0x00001F96, 0x00000342}, {0x00001F96, 0x00000345}, {0x00001F97, 0x000003B7}, +{0x00001F97, 0x00000314}, {0x00001F97, 0x00000342}, {0x00001F97, 0x00000345}, {0x00001F98, 0x00000397}, +{0x00001F98, 0x00000313}, {0x00001F98, 0x00000345}, {0x00001F99, 0x00000397}, {0x00001F99, 0x00000314}, +{0x00001F99, 0x00000345}, {0x00001F9A, 0x00000397}, {0x00001F9A, 0x00000313}, {0x00001F9A, 0x00000300}, +{0x00001F9A, 0x00000345}, {0x00001F9B, 0x00000397}, {0x00001F9B, 0x00000314}, {0x00001F9B, 0x00000300}, +{0x00001F9B, 0x00000345}, {0x00001F9C, 0x00000397}, {0x00001F9C, 0x00000313}, {0x00001F9C, 0x00000301}, +{0x00001F9C, 0x00000345}, {0x00001F9D, 0x00000397}, {0x00001F9D, 0x00000314}, {0x00001F9D, 0x00000301}, +{0x00001F9D, 0x00000345}, {0x00001F9E, 0x00000397}, {0x00001F9E, 0x00000313}, {0x00001F9E, 0x00000342}, +{0x00001F9E, 0x00000345}, {0x00001F9F, 0x00000397}, {0x00001F9F, 0x00000314}, {0x00001F9F, 0x00000342}, +{0x00001F9F, 0x00000345}, {0x00001FA0, 0x000003C9}, {0x00001FA0, 0x00000313}, {0x00001FA0, 0x00000345}, +{0x00001FA1, 0x000003C9}, {0x00001FA1, 0x00000314}, {0x00001FA1, 0x00000345}, {0x00001FA2, 0x000003C9}, +{0x00001FA2, 0x00000313}, {0x00001FA2, 0x00000300}, {0x00001FA2, 0x00000345}, {0x00001FA3, 0x000003C9}, +{0x00001FA3, 0x00000314}, {0x00001FA3, 0x00000300}, {0x00001FA3, 0x00000345}, {0x00001FA4, 0x000003C9}, +{0x00001FA4, 0x00000313}, {0x00001FA4, 0x00000301}, {0x00001FA4, 0x00000345}, {0x00001FA5, 0x000003C9}, +{0x00001FA5, 0x00000314}, {0x00001FA5, 0x00000301}, {0x00001FA5, 0x00000345}, {0x00001FA6, 0x000003C9}, +{0x00001FA6, 0x00000313}, {0x00001FA6, 0x00000342}, {0x00001FA6, 0x00000345}, {0x00001FA7, 0x000003C9}, +{0x00001FA7, 0x00000314}, {0x00001FA7, 0x00000342}, {0x00001FA7, 0x00000345}, {0x00001FA8, 0x000003A9}, +{0x00001FA8, 0x00000313}, {0x00001FA8, 0x00000345}, {0x00001FA9, 0x000003A9}, {0x00001FA9, 0x00000314}, +{0x00001FA9, 0x00000345}, {0x00001FAA, 0x000003A9}, {0x00001FAA, 0x00000313}, {0x00001FAA, 0x00000300}, +{0x00001FAA, 0x00000345}, {0x00001FAB, 0x000003A9}, {0x00001FAB, 0x00000314}, {0x00001FAB, 0x00000300}, +{0x00001FAB, 0x00000345}, {0x00001FAC, 0x000003A9}, {0x00001FAC, 0x00000313}, {0x00001FAC, 0x00000301}, +{0x00001FAC, 0x00000345}, {0x00001FAD, 0x000003A9}, {0x00001FAD, 0x00000314}, {0x00001FAD, 0x00000301}, +{0x00001FAD, 0x00000345}, {0x00001FAE, 0x000003A9}, {0x00001FAE, 0x00000313}, {0x00001FAE, 0x00000342}, +{0x00001FAE, 0x00000345}, {0x00001FAF, 0x000003A9}, {0x00001FAF, 0x00000314}, {0x00001FAF, 0x00000342}, +{0x00001FAF, 0x00000345}, {0x00001FB0, 0x000003B1}, {0x00001FB0, 0x00000306}, {0x00001FB1, 0x000003B1}, +{0x00001FB1, 0x00000304}, {0x00001FB2, 0x000003B1}, {0x00001FB2, 0x00000300}, {0x00001FB2, 0x00000345}, +{0x00001FB3, 0x000003B1}, {0x00001FB3, 0x00000345}, {0x00001FB4, 0x000003B1}, {0x00001FB4, 0x00000301}, +{0x00001FB4, 0x00000345}, {0x00001FB6, 0x000003B1}, {0x00001FB6, 0x00000342}, {0x00001FB7, 0x000003B1}, +{0x00001FB7, 0x00000342}, {0x00001FB7, 0x00000345}, {0x00001FB8, 0x00000391}, {0x00001FB8, 0x00000306}, +{0x00001FB9, 0x00000391}, {0x00001FB9, 0x00000304}, {0x00001FBA, 0x00000391}, {0x00001FBA, 0x00000300}, +{0x00001FBB, 0x00000391}, {0x00001FBB, 0x00000301}, {0x00001FBC, 0x00000391}, {0x00001FBC, 0x00000345}, +{0x00001FBE, 0x000003B9}, {0x00001FC1, 0x000000A8}, {0x00001FC1, 0x00000342}, {0x00001FC2, 0x000003B7}, +{0x00001FC2, 0x00000300}, {0x00001FC2, 0x00000345}, {0x00001FC3, 0x000003B7}, {0x00001FC3, 0x00000345}, +{0x00001FC4, 0x000003B7}, {0x00001FC4, 0x00000301}, {0x00001FC4, 0x00000345}, {0x00001FC6, 0x000003B7}, +{0x00001FC6, 0x00000342}, {0x00001FC7, 0x000003B7}, {0x00001FC7, 0x00000342}, {0x00001FC7, 0x00000345}, +{0x00001FC8, 0x00000395}, {0x00001FC8, 0x00000300}, {0x00001FC9, 0x00000395}, {0x00001FC9, 0x00000301}, +{0x00001FCA, 0x00000397}, {0x00001FCA, 0x00000300}, {0x00001FCB, 0x00000397}, {0x00001FCB, 0x00000301}, +{0x00001FCC, 0x00000397}, {0x00001FCC, 0x00000345}, {0x00001FCD, 0x00001FBF}, {0x00001FCD, 0x00000300}, +{0x00001FCE, 0x00001FBF}, {0x00001FCE, 0x00000301}, {0x00001FCF, 0x00001FBF}, {0x00001FCF, 0x00000342}, +{0x00001FD0, 0x000003B9}, {0x00001FD0, 0x00000306}, {0x00001FD1, 0x000003B9}, {0x00001FD1, 0x00000304}, +{0x00001FD2, 0x000003B9}, {0x00001FD2, 0x00000308}, {0x00001FD2, 0x00000300}, {0x00001FD3, 0x000003B9}, +{0x00001FD3, 0x00000308}, {0x00001FD3, 0x00000301}, {0x00001FD6, 0x000003B9}, {0x00001FD6, 0x00000342}, +{0x00001FD7, 0x000003B9}, {0x00001FD7, 0x00000308}, {0x00001FD7, 0x00000342}, {0x00001FD8, 0x00000399}, +{0x00001FD8, 0x00000306}, {0x00001FD9, 0x00000399}, {0x00001FD9, 0x00000304}, {0x00001FDA, 0x00000399}, +{0x00001FDA, 0x00000300}, {0x00001FDB, 0x00000399}, {0x00001FDB, 0x00000301}, {0x00001FDD, 0x00001FFE}, +{0x00001FDD, 0x00000300}, {0x00001FDE, 0x00001FFE}, {0x00001FDE, 0x00000301}, {0x00001FDF, 0x00001FFE}, +{0x00001FDF, 0x00000342}, {0x00001FE0, 0x000003C5}, {0x00001FE0, 0x00000306}, {0x00001FE1, 0x000003C5}, +{0x00001FE1, 0x00000304}, {0x00001FE2, 0x000003C5}, {0x00001FE2, 0x00000308}, {0x00001FE2, 0x00000300}, +{0x00001FE3, 0x000003C5}, {0x00001FE3, 0x00000308}, {0x00001FE3, 0x00000301}, {0x00001FE4, 0x000003C1}, +{0x00001FE4, 0x00000313}, {0x00001FE5, 0x000003C1}, {0x00001FE5, 0x00000314}, {0x00001FE6, 0x000003C5}, +{0x00001FE6, 0x00000342}, {0x00001FE7, 0x000003C5}, {0x00001FE7, 0x00000308}, {0x00001FE7, 0x00000342}, +{0x00001FE8, 0x000003A5}, {0x00001FE8, 0x00000306}, {0x00001FE9, 0x000003A5}, {0x00001FE9, 0x00000304}, +{0x00001FEA, 0x000003A5}, {0x00001FEA, 0x00000300}, {0x00001FEB, 0x000003A5}, {0x00001FEB, 0x00000301}, +{0x00001FEC, 0x000003A1}, {0x00001FEC, 0x00000314}, {0x00001FED, 0x000000A8}, {0x00001FED, 0x00000300}, +{0x00001FEE, 0x000000A8}, {0x00001FEE, 0x00000301}, {0x00001FEF, 0x00000060}, {0x00001FF2, 0x000003C9}, +{0x00001FF2, 0x00000300}, {0x00001FF2, 0x00000345}, {0x00001FF3, 0x000003C9}, {0x00001FF3, 0x00000345}, +{0x00001FF4, 0x000003C9}, {0x00001FF4, 0x00000301}, {0x00001FF4, 0x00000345}, {0x00001FF6, 0x000003C9}, +{0x00001FF6, 0x00000342}, {0x00001FF7, 0x000003C9}, {0x00001FF7, 0x00000342}, {0x00001FF7, 0x00000345}, +{0x00001FF8, 0x0000039F}, {0x00001FF8, 0x00000300}, {0x00001FF9, 0x0000039F}, {0x00001FF9, 0x00000301}, +{0x00001FFA, 0x000003A9}, {0x00001FFA, 0x00000300}, {0x00001FFB, 0x000003A9}, {0x00001FFB, 0x00000301}, +{0x00001FFC, 0x000003A9}, {0x00001FFC, 0x00000345}, {0x00001FFD, 0x000000B4}, {0x00002000, 0x00002002}, +{0x00002001, 0x00002003}, {0x00002126, 0x000003A9}, {0x0000212A, 0x0000004B}, {0x0000212B, 0x00000041}, +{0x0000212B, 0x0000030A}, {0x0000219A, 0x00002190}, {0x0000219A, 0x00000338}, {0x0000219B, 0x00002192}, +{0x0000219B, 0x00000338}, {0x000021AE, 0x00002194}, {0x000021AE, 0x00000338}, {0x000021CD, 0x000021D0}, +{0x000021CD, 0x00000338}, {0x000021CE, 0x000021D4}, {0x000021CE, 0x00000338}, {0x000021CF, 0x000021D2}, +{0x000021CF, 0x00000338}, {0x00002204, 0x00002203}, {0x00002204, 0x00000338}, {0x00002209, 0x00002208}, +{0x00002209, 0x00000338}, {0x0000220C, 0x0000220B}, {0x0000220C, 0x00000338}, {0x00002224, 0x00002223}, +{0x00002224, 0x00000338}, {0x00002226, 0x00002225}, {0x00002226, 0x00000338}, {0x00002241, 0x0000223C}, +{0x00002241, 0x00000338}, {0x00002244, 0x00002243}, {0x00002244, 0x00000338}, {0x00002247, 0x00002245}, +{0x00002247, 0x00000338}, {0x00002249, 0x00002248}, {0x00002249, 0x00000338}, {0x00002260, 0x0000003D}, +{0x00002260, 0x00000338}, {0x00002262, 0x00002261}, {0x00002262, 0x00000338}, {0x0000226D, 0x0000224D}, +{0x0000226D, 0x00000338}, {0x0000226E, 0x0000003C}, {0x0000226E, 0x00000338}, {0x0000226F, 0x0000003E}, +{0x0000226F, 0x00000338}, {0x00002270, 0x00002264}, {0x00002270, 0x00000338}, {0x00002271, 0x00002265}, +{0x00002271, 0x00000338}, {0x00002274, 0x00002272}, {0x00002274, 0x00000338}, {0x00002275, 0x00002273}, +{0x00002275, 0x00000338}, {0x00002278, 0x00002276}, {0x00002278, 0x00000338}, {0x00002279, 0x00002277}, +{0x00002279, 0x00000338}, {0x00002280, 0x0000227A}, {0x00002280, 0x00000338}, {0x00002281, 0x0000227B}, +{0x00002281, 0x00000338}, {0x00002284, 0x00002282}, {0x00002284, 0x00000338}, {0x00002285, 0x00002283}, +{0x00002285, 0x00000338}, {0x00002288, 0x00002286}, {0x00002288, 0x00000338}, {0x00002289, 0x00002287}, +{0x00002289, 0x00000338}, {0x000022AC, 0x000022A2}, {0x000022AC, 0x00000338}, {0x000022AD, 0x000022A8}, +{0x000022AD, 0x00000338}, {0x000022AE, 0x000022A9}, {0x000022AE, 0x00000338}, {0x000022AF, 0x000022AB}, +{0x000022AF, 0x00000338}, {0x000022E0, 0x0000227C}, {0x000022E0, 0x00000338}, {0x000022E1, 0x0000227D}, +{0x000022E1, 0x00000338}, {0x000022E2, 0x00002291}, {0x000022E2, 0x00000338}, {0x000022E3, 0x00002292}, +{0x000022E3, 0x00000338}, {0x000022EA, 0x000022B2}, {0x000022EA, 0x00000338}, {0x000022EB, 0x000022B3}, +{0x000022EB, 0x00000338}, {0x000022EC, 0x000022B4}, {0x000022EC, 0x00000338}, {0x000022ED, 0x000022B5}, +{0x000022ED, 0x00000338}, {0x00002329, 0x00003008}, {0x0000232A, 0x00003009}, {0x00002ADC, 0x00002ADD}, +{0x00002ADC, 0x00000338}, {0x0000304C, 0x0000304B}, {0x0000304C, 0x00003099}, {0x0000304E, 0x0000304D}, +{0x0000304E, 0x00003099}, {0x00003050, 0x0000304F}, {0x00003050, 0x00003099}, {0x00003052, 0x00003051}, +{0x00003052, 0x00003099}, {0x00003054, 0x00003053}, {0x00003054, 0x00003099}, {0x00003056, 0x00003055}, +{0x00003056, 0x00003099}, {0x00003058, 0x00003057}, {0x00003058, 0x00003099}, {0x0000305A, 0x00003059}, +{0x0000305A, 0x00003099}, {0x0000305C, 0x0000305B}, {0x0000305C, 0x00003099}, {0x0000305E, 0x0000305D}, +{0x0000305E, 0x00003099}, {0x00003060, 0x0000305F}, {0x00003060, 0x00003099}, {0x00003062, 0x00003061}, +{0x00003062, 0x00003099}, {0x00003065, 0x00003064}, {0x00003065, 0x00003099}, {0x00003067, 0x00003066}, +{0x00003067, 0x00003099}, {0x00003069, 0x00003068}, {0x00003069, 0x00003099}, {0x00003070, 0x0000306F}, +{0x00003070, 0x00003099}, {0x00003071, 0x0000306F}, {0x00003071, 0x0000309A}, {0x00003073, 0x00003072}, +{0x00003073, 0x00003099}, {0x00003074, 0x00003072}, {0x00003074, 0x0000309A}, {0x00003076, 0x00003075}, +{0x00003076, 0x00003099}, {0x00003077, 0x00003075}, {0x00003077, 0x0000309A}, {0x00003079, 0x00003078}, +{0x00003079, 0x00003099}, {0x0000307A, 0x00003078}, {0x0000307A, 0x0000309A}, {0x0000307C, 0x0000307B}, +{0x0000307C, 0x00003099}, {0x0000307D, 0x0000307B}, {0x0000307D, 0x0000309A}, {0x00003094, 0x00003046}, +{0x00003094, 0x00003099}, {0x0000309E, 0x0000309D}, {0x0000309E, 0x00003099}, {0x000030AC, 0x000030AB}, +{0x000030AC, 0x00003099}, {0x000030AE, 0x000030AD}, {0x000030AE, 0x00003099}, {0x000030B0, 0x000030AF}, +{0x000030B0, 0x00003099}, {0x000030B2, 0x000030B1}, {0x000030B2, 0x00003099}, {0x000030B4, 0x000030B3}, +{0x000030B4, 0x00003099}, {0x000030B6, 0x000030B5}, {0x000030B6, 0x00003099}, {0x000030B8, 0x000030B7}, +{0x000030B8, 0x00003099}, {0x000030BA, 0x000030B9}, {0x000030BA, 0x00003099}, {0x000030BC, 0x000030BB}, +{0x000030BC, 0x00003099}, {0x000030BE, 0x000030BD}, {0x000030BE, 0x00003099}, {0x000030C0, 0x000030BF}, +{0x000030C0, 0x00003099}, {0x000030C2, 0x000030C1}, {0x000030C2, 0x00003099}, {0x000030C5, 0x000030C4}, +{0x000030C5, 0x00003099}, {0x000030C7, 0x000030C6}, {0x000030C7, 0x00003099}, {0x000030C9, 0x000030C8}, +{0x000030C9, 0x00003099}, {0x000030D0, 0x000030CF}, {0x000030D0, 0x00003099}, {0x000030D1, 0x000030CF}, +{0x000030D1, 0x0000309A}, {0x000030D3, 0x000030D2}, {0x000030D3, 0x00003099}, {0x000030D4, 0x000030D2}, +{0x000030D4, 0x0000309A}, {0x000030D6, 0x000030D5}, {0x000030D6, 0x00003099}, {0x000030D7, 0x000030D5}, +{0x000030D7, 0x0000309A}, {0x000030D9, 0x000030D8}, {0x000030D9, 0x00003099}, {0x000030DA, 0x000030D8}, +{0x000030DA, 0x0000309A}, {0x000030DC, 0x000030DB}, {0x000030DC, 0x00003099}, {0x000030DD, 0x000030DB}, +{0x000030DD, 0x0000309A}, {0x000030F4, 0x000030A6}, {0x000030F4, 0x00003099}, {0x000030F7, 0x000030EF}, +{0x000030F7, 0x00003099}, {0x000030F8, 0x000030F0}, {0x000030F8, 0x00003099}, {0x000030F9, 0x000030F1}, +{0x000030F9, 0x00003099}, {0x000030FA, 0x000030F2}, {0x000030FA, 0x00003099}, {0x000030FE, 0x000030FD}, +{0x000030FE, 0x00003099}, {0x0000F900, 0x00008C48}, {0x0000F901, 0x000066F4}, {0x0000F902, 0x00008ECA}, +{0x0000F903, 0x00008CC8}, {0x0000F904, 0x00006ED1}, {0x0000F905, 0x00004E32}, {0x0000F906, 0x000053E5}, +{0x0000F907, 0x00009F9C}, {0x0000F908, 0x00009F9C}, {0x0000F909, 0x00005951}, {0x0000F90A, 0x000091D1}, +{0x0000F90B, 0x00005587}, {0x0000F90C, 0x00005948}, {0x0000F90D, 0x000061F6}, {0x0000F90E, 0x00007669}, +{0x0000F90F, 0x00007F85}, {0x0000F910, 0x0000863F}, {0x0000F911, 0x000087BA}, {0x0000F912, 0x000088F8}, +{0x0000F913, 0x0000908F}, {0x0000F914, 0x00006A02}, {0x0000F915, 0x00006D1B}, {0x0000F916, 0x000070D9}, +{0x0000F917, 0x000073DE}, {0x0000F918, 0x0000843D}, {0x0000F919, 0x0000916A}, {0x0000F91A, 0x000099F1}, +{0x0000F91B, 0x00004E82}, {0x0000F91C, 0x00005375}, {0x0000F91D, 0x00006B04}, {0x0000F91E, 0x0000721B}, +{0x0000F91F, 0x0000862D}, {0x0000F920, 0x00009E1E}, {0x0000F921, 0x00005D50}, {0x0000F922, 0x00006FEB}, +{0x0000F923, 0x000085CD}, {0x0000F924, 0x00008964}, {0x0000F925, 0x000062C9}, {0x0000F926, 0x000081D8}, +{0x0000F927, 0x0000881F}, {0x0000F928, 0x00005ECA}, {0x0000F929, 0x00006717}, {0x0000F92A, 0x00006D6A}, +{0x0000F92B, 0x000072FC}, {0x0000F92C, 0x000090CE}, {0x0000F92D, 0x00004F86}, {0x0000F92E, 0x000051B7}, +{0x0000F92F, 0x000052DE}, {0x0000F930, 0x000064C4}, {0x0000F931, 0x00006AD3}, {0x0000F932, 0x00007210}, +{0x0000F933, 0x000076E7}, {0x0000F934, 0x00008001}, {0x0000F935, 0x00008606}, {0x0000F936, 0x0000865C}, +{0x0000F937, 0x00008DEF}, {0x0000F938, 0x00009732}, {0x0000F939, 0x00009B6F}, {0x0000F93A, 0x00009DFA}, +{0x0000F93B, 0x0000788C}, {0x0000F93C, 0x0000797F}, {0x0000F93D, 0x00007DA0}, {0x0000F93E, 0x000083C9}, +{0x0000F93F, 0x00009304}, {0x0000F940, 0x00009E7F}, {0x0000F941, 0x00008AD6}, {0x0000F942, 0x000058DF}, +{0x0000F943, 0x00005F04}, {0x0000F944, 0x00007C60}, {0x0000F945, 0x0000807E}, {0x0000F946, 0x00007262}, +{0x0000F947, 0x000078CA}, {0x0000F948, 0x00008CC2}, {0x0000F949, 0x000096F7}, {0x0000F94A, 0x000058D8}, +{0x0000F94B, 0x00005C62}, {0x0000F94C, 0x00006A13}, {0x0000F94D, 0x00006DDA}, {0x0000F94E, 0x00006F0F}, +{0x0000F94F, 0x00007D2F}, {0x0000F950, 0x00007E37}, {0x0000F951, 0x0000964B}, {0x0000F952, 0x000052D2}, +{0x0000F953, 0x0000808B}, {0x0000F954, 0x000051DC}, {0x0000F955, 0x000051CC}, {0x0000F956, 0x00007A1C}, +{0x0000F957, 0x00007DBE}, {0x0000F958, 0x000083F1}, {0x0000F959, 0x00009675}, {0x0000F95A, 0x00008B80}, +{0x0000F95B, 0x000062CF}, {0x0000F95C, 0x00006A02}, {0x0000F95D, 0x00008AFE}, {0x0000F95E, 0x00004E39}, +{0x0000F95F, 0x00005BE7}, {0x0000F960, 0x00006012}, {0x0000F961, 0x00007387}, {0x0000F962, 0x00007570}, +{0x0000F963, 0x00005317}, {0x0000F964, 0x000078FB}, {0x0000F965, 0x00004FBF}, {0x0000F966, 0x00005FA9}, +{0x0000F967, 0x00004E0D}, {0x0000F968, 0x00006CCC}, {0x0000F969, 0x00006578}, {0x0000F96A, 0x00007D22}, +{0x0000F96B, 0x000053C3}, {0x0000F96C, 0x0000585E}, {0x0000F96D, 0x00007701}, {0x0000F96E, 0x00008449}, +{0x0000F96F, 0x00008AAA}, {0x0000F970, 0x00006BBA}, {0x0000F971, 0x00008FB0}, {0x0000F972, 0x00006C88}, +{0x0000F973, 0x000062FE}, {0x0000F974, 0x000082E5}, {0x0000F975, 0x000063A0}, {0x0000F976, 0x00007565}, +{0x0000F977, 0x00004EAE}, {0x0000F978, 0x00005169}, {0x0000F979, 0x000051C9}, {0x0000F97A, 0x00006881}, +{0x0000F97B, 0x00007CE7}, {0x0000F97C, 0x0000826F}, {0x0000F97D, 0x00008AD2}, {0x0000F97E, 0x000091CF}, +{0x0000F97F, 0x000052F5}, {0x0000F980, 0x00005442}, {0x0000F981, 0x00005973}, {0x0000F982, 0x00005EEC}, +{0x0000F983, 0x000065C5}, {0x0000F984, 0x00006FFE}, {0x0000F985, 0x0000792A}, {0x0000F986, 0x000095AD}, +{0x0000F987, 0x00009A6A}, {0x0000F988, 0x00009E97}, {0x0000F989, 0x00009ECE}, {0x0000F98A, 0x0000529B}, +{0x0000F98B, 0x000066C6}, {0x0000F98C, 0x00006B77}, {0x0000F98D, 0x00008F62}, {0x0000F98E, 0x00005E74}, +{0x0000F98F, 0x00006190}, {0x0000F990, 0x00006200}, {0x0000F991, 0x0000649A}, {0x0000F992, 0x00006F23}, +{0x0000F993, 0x00007149}, {0x0000F994, 0x00007489}, {0x0000F995, 0x000079CA}, {0x0000F996, 0x00007DF4}, +{0x0000F997, 0x0000806F}, {0x0000F998, 0x00008F26}, {0x0000F999, 0x000084EE}, {0x0000F99A, 0x00009023}, +{0x0000F99B, 0x0000934A}, {0x0000F99C, 0x00005217}, {0x0000F99D, 0x000052A3}, {0x0000F99E, 0x000054BD}, +{0x0000F99F, 0x000070C8}, {0x0000F9A0, 0x000088C2}, {0x0000F9A1, 0x00008AAA}, {0x0000F9A2, 0x00005EC9}, +{0x0000F9A3, 0x00005FF5}, {0x0000F9A4, 0x0000637B}, {0x0000F9A5, 0x00006BAE}, {0x0000F9A6, 0x00007C3E}, +{0x0000F9A7, 0x00007375}, {0x0000F9A8, 0x00004EE4}, {0x0000F9A9, 0x000056F9}, {0x0000F9AA, 0x00005BE7}, +{0x0000F9AB, 0x00005DBA}, {0x0000F9AC, 0x0000601C}, {0x0000F9AD, 0x000073B2}, {0x0000F9AE, 0x00007469}, +{0x0000F9AF, 0x00007F9A}, {0x0000F9B0, 0x00008046}, {0x0000F9B1, 0x00009234}, {0x0000F9B2, 0x000096F6}, +{0x0000F9B3, 0x00009748}, {0x0000F9B4, 0x00009818}, {0x0000F9B5, 0x00004F8B}, {0x0000F9B6, 0x000079AE}, +{0x0000F9B7, 0x000091B4}, {0x0000F9B8, 0x000096B8}, {0x0000F9B9, 0x000060E1}, {0x0000F9BA, 0x00004E86}, +{0x0000F9BB, 0x000050DA}, {0x0000F9BC, 0x00005BEE}, {0x0000F9BD, 0x00005C3F}, {0x0000F9BE, 0x00006599}, +{0x0000F9BF, 0x00006A02}, {0x0000F9C0, 0x000071CE}, {0x0000F9C1, 0x00007642}, {0x0000F9C2, 0x000084FC}, +{0x0000F9C3, 0x0000907C}, {0x0000F9C4, 0x00009F8D}, {0x0000F9C5, 0x00006688}, {0x0000F9C6, 0x0000962E}, +{0x0000F9C7, 0x00005289}, {0x0000F9C8, 0x0000677B}, {0x0000F9C9, 0x000067F3}, {0x0000F9CA, 0x00006D41}, +{0x0000F9CB, 0x00006E9C}, {0x0000F9CC, 0x00007409}, {0x0000F9CD, 0x00007559}, {0x0000F9CE, 0x0000786B}, +{0x0000F9CF, 0x00007D10}, {0x0000F9D0, 0x0000985E}, {0x0000F9D1, 0x0000516D}, {0x0000F9D2, 0x0000622E}, +{0x0000F9D3, 0x00009678}, {0x0000F9D4, 0x0000502B}, {0x0000F9D5, 0x00005D19}, {0x0000F9D6, 0x00006DEA}, +{0x0000F9D7, 0x00008F2A}, {0x0000F9D8, 0x00005F8B}, {0x0000F9D9, 0x00006144}, {0x0000F9DA, 0x00006817}, +{0x0000F9DB, 0x00007387}, {0x0000F9DC, 0x00009686}, {0x0000F9DD, 0x00005229}, {0x0000F9DE, 0x0000540F}, +{0x0000F9DF, 0x00005C65}, {0x0000F9E0, 0x00006613}, {0x0000F9E1, 0x0000674E}, {0x0000F9E2, 0x000068A8}, +{0x0000F9E3, 0x00006CE5}, {0x0000F9E4, 0x00007406}, {0x0000F9E5, 0x000075E2}, {0x0000F9E6, 0x00007F79}, +{0x0000F9E7, 0x000088CF}, {0x0000F9E8, 0x000088E1}, {0x0000F9E9, 0x000091CC}, {0x0000F9EA, 0x000096E2}, +{0x0000F9EB, 0x0000533F}, {0x0000F9EC, 0x00006EBA}, {0x0000F9ED, 0x0000541D}, {0x0000F9EE, 0x000071D0}, +{0x0000F9EF, 0x00007498}, {0x0000F9F0, 0x000085FA}, {0x0000F9F1, 0x000096A3}, {0x0000F9F2, 0x00009C57}, +{0x0000F9F3, 0x00009E9F}, {0x0000F9F4, 0x00006797}, {0x0000F9F5, 0x00006DCB}, {0x0000F9F6, 0x000081E8}, +{0x0000F9F7, 0x00007ACB}, {0x0000F9F8, 0x00007B20}, {0x0000F9F9, 0x00007C92}, {0x0000F9FA, 0x000072C0}, +{0x0000F9FB, 0x00007099}, {0x0000F9FC, 0x00008B58}, {0x0000F9FD, 0x00004EC0}, {0x0000F9FE, 0x00008336}, +{0x0000F9FF, 0x0000523A}, {0x0000FA00, 0x00005207}, {0x0000FA01, 0x00005EA6}, {0x0000FA02, 0x000062D3}, +{0x0000FA03, 0x00007CD6}, {0x0000FA04, 0x00005B85}, {0x0000FA05, 0x00006D1E}, {0x0000FA06, 0x000066B4}, +{0x0000FA07, 0x00008F3B}, {0x0000FA08, 0x0000884C}, {0x0000FA09, 0x0000964D}, {0x0000FA0A, 0x0000898B}, +{0x0000FA0B, 0x00005ED3}, {0x0000FA0C, 0x00005140}, {0x0000FA0D, 0x000055C0}, {0x0000FA10, 0x0000585A}, +{0x0000FA12, 0x00006674}, {0x0000FA15, 0x000051DE}, {0x0000FA16, 0x0000732A}, {0x0000FA17, 0x000076CA}, +{0x0000FA18, 0x0000793C}, {0x0000FA19, 0x0000795E}, {0x0000FA1A, 0x00007965}, {0x0000FA1B, 0x0000798F}, +{0x0000FA1C, 0x00009756}, {0x0000FA1D, 0x00007CBE}, {0x0000FA1E, 0x00007FBD}, {0x0000FA20, 0x00008612}, +{0x0000FA22, 0x00008AF8}, {0x0000FA25, 0x00009038}, {0x0000FA26, 0x000090FD}, {0x0000FA2A, 0x000098EF}, +{0x0000FA2B, 0x000098FC}, {0x0000FA2C, 0x00009928}, {0x0000FA2D, 0x00009DB4}, {0x0000FA2E, 0x000090DE}, +{0x0000FA2F, 0x000096B7}, {0x0000FA30, 0x00004FAE}, {0x0000FA31, 0x000050E7}, {0x0000FA32, 0x0000514D}, +{0x0000FA33, 0x000052C9}, {0x0000FA34, 0x000052E4}, {0x0000FA35, 0x00005351}, {0x0000FA36, 0x0000559D}, +{0x0000FA37, 0x00005606}, {0x0000FA38, 0x00005668}, {0x0000FA39, 0x00005840}, {0x0000FA3A, 0x000058A8}, +{0x0000FA3B, 0x00005C64}, {0x0000FA3C, 0x00005C6E}, {0x0000FA3D, 0x00006094}, {0x0000FA3E, 0x00006168}, +{0x0000FA3F, 0x0000618E}, {0x0000FA40, 0x000061F2}, {0x0000FA41, 0x0000654F}, {0x0000FA42, 0x000065E2}, +{0x0000FA43, 0x00006691}, {0x0000FA44, 0x00006885}, {0x0000FA45, 0x00006D77}, {0x0000FA46, 0x00006E1A}, +{0x0000FA47, 0x00006F22}, {0x0000FA48, 0x0000716E}, {0x0000FA49, 0x0000722B}, {0x0000FA4A, 0x00007422}, +{0x0000FA4B, 0x00007891}, {0x0000FA4C, 0x0000793E}, {0x0000FA4D, 0x00007949}, {0x0000FA4E, 0x00007948}, +{0x0000FA4F, 0x00007950}, {0x0000FA50, 0x00007956}, {0x0000FA51, 0x0000795D}, {0x0000FA52, 0x0000798D}, +{0x0000FA53, 0x0000798E}, {0x0000FA54, 0x00007A40}, {0x0000FA55, 0x00007A81}, {0x0000FA56, 0x00007BC0}, +{0x0000FA57, 0x00007DF4}, {0x0000FA58, 0x00007E09}, {0x0000FA59, 0x00007E41}, {0x0000FA5A, 0x00007F72}, +{0x0000FA5B, 0x00008005}, {0x0000FA5C, 0x000081ED}, {0x0000FA5D, 0x00008279}, {0x0000FA5E, 0x00008279}, +{0x0000FA5F, 0x00008457}, {0x0000FA60, 0x00008910}, {0x0000FA61, 0x00008996}, {0x0000FA62, 0x00008B01}, +{0x0000FA63, 0x00008B39}, {0x0000FA64, 0x00008CD3}, {0x0000FA65, 0x00008D08}, {0x0000FA66, 0x00008FB6}, +{0x0000FA67, 0x00009038}, {0x0000FA68, 0x000096E3}, {0x0000FA69, 0x000097FF}, {0x0000FA6A, 0x0000983B}, +{0x0000FA6B, 0x00006075}, {0x0000FA6C, 0x000242EE}, {0x0000FA6D, 0x00008218}, {0x0000FA70, 0x00004E26}, +{0x0000FA71, 0x000051B5}, {0x0000FA72, 0x00005168}, {0x0000FA73, 0x00004F80}, {0x0000FA74, 0x00005145}, +{0x0000FA75, 0x00005180}, {0x0000FA76, 0x000052C7}, {0x0000FA77, 0x000052FA}, {0x0000FA78, 0x0000559D}, +{0x0000FA79, 0x00005555}, {0x0000FA7A, 0x00005599}, {0x0000FA7B, 0x000055E2}, {0x0000FA7C, 0x0000585A}, +{0x0000FA7D, 0x000058B3}, {0x0000FA7E, 0x00005944}, {0x0000FA7F, 0x00005954}, {0x0000FA80, 0x00005A62}, +{0x0000FA81, 0x00005B28}, {0x0000FA82, 0x00005ED2}, {0x0000FA83, 0x00005ED9}, {0x0000FA84, 0x00005F69}, +{0x0000FA85, 0x00005FAD}, {0x0000FA86, 0x000060D8}, {0x0000FA87, 0x0000614E}, {0x0000FA88, 0x00006108}, +{0x0000FA89, 0x0000618E}, {0x0000FA8A, 0x00006160}, {0x0000FA8B, 0x000061F2}, {0x0000FA8C, 0x00006234}, +{0x0000FA8D, 0x000063C4}, {0x0000FA8E, 0x0000641C}, {0x0000FA8F, 0x00006452}, {0x0000FA90, 0x00006556}, +{0x0000FA91, 0x00006674}, {0x0000FA92, 0x00006717}, {0x0000FA93, 0x0000671B}, {0x0000FA94, 0x00006756}, +{0x0000FA95, 0x00006B79}, {0x0000FA96, 0x00006BBA}, {0x0000FA97, 0x00006D41}, {0x0000FA98, 0x00006EDB}, +{0x0000FA99, 0x00006ECB}, {0x0000FA9A, 0x00006F22}, {0x0000FA9B, 0x0000701E}, {0x0000FA9C, 0x0000716E}, +{0x0000FA9D, 0x000077A7}, {0x0000FA9E, 0x00007235}, {0x0000FA9F, 0x000072AF}, {0x0000FAA0, 0x0000732A}, +{0x0000FAA1, 0x00007471}, {0x0000FAA2, 0x00007506}, {0x0000FAA3, 0x0000753B}, {0x0000FAA4, 0x0000761D}, +{0x0000FAA5, 0x0000761F}, {0x0000FAA6, 0x000076CA}, {0x0000FAA7, 0x000076DB}, {0x0000FAA8, 0x000076F4}, +{0x0000FAA9, 0x0000774A}, {0x0000FAAA, 0x00007740}, {0x0000FAAB, 0x000078CC}, {0x0000FAAC, 0x00007AB1}, +{0x0000FAAD, 0x00007BC0}, {0x0000FAAE, 0x00007C7B}, {0x0000FAAF, 0x00007D5B}, {0x0000FAB0, 0x00007DF4}, +{0x0000FAB1, 0x00007F3E}, {0x0000FAB2, 0x00008005}, {0x0000FAB3, 0x00008352}, {0x0000FAB4, 0x000083EF}, +{0x0000FAB5, 0x00008779}, {0x0000FAB6, 0x00008941}, {0x0000FAB7, 0x00008986}, {0x0000FAB8, 0x00008996}, +{0x0000FAB9, 0x00008ABF}, {0x0000FABA, 0x00008AF8}, {0x0000FABB, 0x00008ACB}, {0x0000FABC, 0x00008B01}, +{0x0000FABD, 0x00008AFE}, {0x0000FABE, 0x00008AED}, {0x0000FABF, 0x00008B39}, {0x0000FAC0, 0x00008B8A}, +{0x0000FAC1, 0x00008D08}, {0x0000FAC2, 0x00008F38}, {0x0000FAC3, 0x00009072}, {0x0000FAC4, 0x00009199}, +{0x0000FAC5, 0x00009276}, {0x0000FAC6, 0x0000967C}, {0x0000FAC7, 0x000096E3}, {0x0000FAC8, 0x00009756}, +{0x0000FAC9, 0x000097DB}, {0x0000FACA, 0x000097FF}, {0x0000FACB, 0x0000980B}, {0x0000FACC, 0x0000983B}, +{0x0000FACD, 0x00009B12}, {0x0000FACE, 0x00009F9C}, {0x0000FACF, 0x0002284A}, {0x0000FAD0, 0x00022844}, +{0x0000FAD1, 0x000233D5}, {0x0000FAD2, 0x00003B9D}, {0x0000FAD3, 0x00004018}, {0x0000FAD4, 0x00004039}, +{0x0000FAD5, 0x00025249}, {0x0000FAD6, 0x00025CD0}, {0x0000FAD7, 0x00027ED3}, {0x0000FAD8, 0x00009F43}, +{0x0000FAD9, 0x00009F8E}, {0x0000FB1D, 0x000005D9}, {0x0000FB1D, 0x000005B4}, {0x0000FB1F, 0x000005F2}, +{0x0000FB1F, 0x000005B7}, {0x0000FB2A, 0x000005E9}, {0x0000FB2A, 0x000005C1}, {0x0000FB2B, 0x000005E9}, +{0x0000FB2B, 0x000005C2}, {0x0000FB2C, 0x000005E9}, {0x0000FB2C, 0x000005BC}, {0x0000FB2C, 0x000005C1}, +{0x0000FB2D, 0x000005E9}, {0x0000FB2D, 0x000005BC}, {0x0000FB2D, 0x000005C2}, {0x0000FB2E, 0x000005D0}, +{0x0000FB2E, 0x000005B7}, {0x0000FB2F, 0x000005D0}, {0x0000FB2F, 0x000005B8}, {0x0000FB30, 0x000005D0}, +{0x0000FB30, 0x000005BC}, {0x0000FB31, 0x000005D1}, {0x0000FB31, 0x000005BC}, {0x0000FB32, 0x000005D2}, +{0x0000FB32, 0x000005BC}, {0x0000FB33, 0x000005D3}, {0x0000FB33, 0x000005BC}, {0x0000FB34, 0x000005D4}, +{0x0000FB34, 0x000005BC}, {0x0000FB35, 0x000005D5}, {0x0000FB35, 0x000005BC}, {0x0000FB36, 0x000005D6}, +{0x0000FB36, 0x000005BC}, {0x0000FB38, 0x000005D8}, {0x0000FB38, 0x000005BC}, {0x0000FB39, 0x000005D9}, +{0x0000FB39, 0x000005BC}, {0x0000FB3A, 0x000005DA}, {0x0000FB3A, 0x000005BC}, {0x0000FB3B, 0x000005DB}, +{0x0000FB3B, 0x000005BC}, {0x0000FB3C, 0x000005DC}, {0x0000FB3C, 0x000005BC}, {0x0000FB3E, 0x000005DE}, +{0x0000FB3E, 0x000005BC}, {0x0000FB40, 0x000005E0}, {0x0000FB40, 0x000005BC}, {0x0000FB41, 0x000005E1}, +{0x0000FB41, 0x000005BC}, {0x0000FB43, 0x000005E3}, {0x0000FB43, 0x000005BC}, {0x0000FB44, 0x000005E4}, +{0x0000FB44, 0x000005BC}, {0x0000FB46, 0x000005E6}, {0x0000FB46, 0x000005BC}, {0x0000FB47, 0x000005E7}, +{0x0000FB47, 0x000005BC}, {0x0000FB48, 0x000005E8}, {0x0000FB48, 0x000005BC}, {0x0000FB49, 0x000005E9}, +{0x0000FB49, 0x000005BC}, {0x0000FB4A, 0x000005EA}, {0x0000FB4A, 0x000005BC}, {0x0000FB4B, 0x000005D5}, +{0x0000FB4B, 0x000005B9}, {0x0000FB4C, 0x000005D1}, {0x0000FB4C, 0x000005BF}, {0x0000FB4D, 0x000005DB}, +{0x0000FB4D, 0x000005BF}, {0x0000FB4E, 0x000005E4}, {0x0000FB4E, 0x000005BF}, {0x0001109A, 0x00011099}, +{0x0001109A, 0x000110BA}, {0x0001109C, 0x0001109B}, {0x0001109C, 0x000110BA}, {0x000110AB, 0x000110A5}, +{0x000110AB, 0x000110BA}, {0x0001112E, 0x00011131}, {0x0001112E, 0x00011127}, {0x0001112F, 0x00011132}, +{0x0001112F, 0x00011127}, {0x0001134B, 0x00011347}, {0x0001134B, 0x0001133E}, {0x0001134C, 0x00011347}, +{0x0001134C, 0x00011357}, {0x000114BB, 0x000114B9}, {0x000114BB, 0x000114BA}, {0x000114BC, 0x000114B9}, +{0x000114BC, 0x000114B0}, {0x000114BE, 0x000114B9}, {0x000114BE, 0x000114BD}, {0x000115BA, 0x000115B8}, +{0x000115BA, 0x000115AF}, {0x000115BB, 0x000115B9}, {0x000115BB, 0x000115AF}, {0x0001D15E, 0x0001D157}, +{0x0001D15E, 0x0001D165}, {0x0001D15F, 0x0001D158}, {0x0001D15F, 0x0001D165}, {0x0001D160, 0x0001D158}, +{0x0001D160, 0x0001D165}, {0x0001D160, 0x0001D16E}, {0x0001D161, 0x0001D158}, {0x0001D161, 0x0001D165}, +{0x0001D161, 0x0001D16F}, {0x0001D162, 0x0001D158}, {0x0001D162, 0x0001D165}, {0x0001D162, 0x0001D170}, +{0x0001D163, 0x0001D158}, {0x0001D163, 0x0001D165}, {0x0001D163, 0x0001D171}, {0x0001D164, 0x0001D158}, +{0x0001D164, 0x0001D165}, {0x0001D164, 0x0001D172}, {0x0001D1BB, 0x0001D1B9}, {0x0001D1BB, 0x0001D165}, +{0x0001D1BC, 0x0001D1BA}, {0x0001D1BC, 0x0001D165}, {0x0001D1BD, 0x0001D1B9}, {0x0001D1BD, 0x0001D165}, +{0x0001D1BD, 0x0001D16E}, {0x0001D1BE, 0x0001D1BA}, {0x0001D1BE, 0x0001D165}, {0x0001D1BE, 0x0001D16E}, +{0x0001D1BF, 0x0001D1B9}, {0x0001D1BF, 0x0001D165}, {0x0001D1BF, 0x0001D16F}, {0x0001D1C0, 0x0001D1BA}, +{0x0001D1C0, 0x0001D165}, {0x0001D1C0, 0x0001D16F}, {0x0002F800, 0x00004E3D}, {0x0002F801, 0x00004E38}, +{0x0002F802, 0x00004E41}, {0x0002F803, 0x00020122}, {0x0002F804, 0x00004F60}, {0x0002F805, 0x00004FAE}, +{0x0002F806, 0x00004FBB}, {0x0002F807, 0x00005002}, {0x0002F808, 0x0000507A}, {0x0002F809, 0x00005099}, +{0x0002F80A, 0x000050E7}, {0x0002F80B, 0x000050CF}, {0x0002F80C, 0x0000349E}, {0x0002F80D, 0x0002063A}, +{0x0002F80E, 0x0000514D}, {0x0002F80F, 0x00005154}, {0x0002F810, 0x00005164}, {0x0002F811, 0x00005177}, +{0x0002F812, 0x0002051C}, {0x0002F813, 0x000034B9}, {0x0002F814, 0x00005167}, {0x0002F815, 0x0000518D}, +{0x0002F816, 0x0002054B}, {0x0002F817, 0x00005197}, {0x0002F818, 0x000051A4}, {0x0002F819, 0x00004ECC}, +{0x0002F81A, 0x000051AC}, {0x0002F81B, 0x000051B5}, {0x0002F81C, 0x000291DF}, {0x0002F81D, 0x000051F5}, +{0x0002F81E, 0x00005203}, {0x0002F81F, 0x000034DF}, {0x0002F820, 0x0000523B}, {0x0002F821, 0x00005246}, +{0x0002F822, 0x00005272}, {0x0002F823, 0x00005277}, {0x0002F824, 0x00003515}, {0x0002F825, 0x000052C7}, +{0x0002F826, 0x000052C9}, {0x0002F827, 0x000052E4}, {0x0002F828, 0x000052FA}, {0x0002F829, 0x00005305}, +{0x0002F82A, 0x00005306}, {0x0002F82B, 0x00005317}, {0x0002F82C, 0x00005349}, {0x0002F82D, 0x00005351}, +{0x0002F82E, 0x0000535A}, {0x0002F82F, 0x00005373}, {0x0002F830, 0x0000537D}, {0x0002F831, 0x0000537F}, +{0x0002F832, 0x0000537F}, {0x0002F833, 0x0000537F}, {0x0002F834, 0x00020A2C}, {0x0002F835, 0x00007070}, +{0x0002F836, 0x000053CA}, {0x0002F837, 0x000053DF}, {0x0002F838, 0x00020B63}, {0x0002F839, 0x000053EB}, +{0x0002F83A, 0x000053F1}, {0x0002F83B, 0x00005406}, {0x0002F83C, 0x0000549E}, {0x0002F83D, 0x00005438}, +{0x0002F83E, 0x00005448}, {0x0002F83F, 0x00005468}, {0x0002F840, 0x000054A2}, {0x0002F841, 0x000054F6}, +{0x0002F842, 0x00005510}, {0x0002F843, 0x00005553}, {0x0002F844, 0x00005563}, {0x0002F845, 0x00005584}, +{0x0002F846, 0x00005584}, {0x0002F847, 0x00005599}, {0x0002F848, 0x000055AB}, {0x0002F849, 0x000055B3}, +{0x0002F84A, 0x000055C2}, {0x0002F84B, 0x00005716}, {0x0002F84C, 0x00005606}, {0x0002F84D, 0x00005717}, +{0x0002F84E, 0x00005651}, {0x0002F84F, 0x00005674}, {0x0002F850, 0x00005207}, {0x0002F851, 0x000058EE}, +{0x0002F852, 0x000057CE}, {0x0002F853, 0x000057F4}, {0x0002F854, 0x0000580D}, {0x0002F855, 0x0000578B}, +{0x0002F856, 0x00005832}, {0x0002F857, 0x00005831}, {0x0002F858, 0x000058AC}, {0x0002F859, 0x000214E4}, +{0x0002F85A, 0x000058F2}, {0x0002F85B, 0x000058F7}, {0x0002F85C, 0x00005906}, {0x0002F85D, 0x0000591A}, +{0x0002F85E, 0x00005922}, {0x0002F85F, 0x00005962}, {0x0002F860, 0x000216A8}, {0x0002F861, 0x000216EA}, +{0x0002F862, 0x000059EC}, {0x0002F863, 0x00005A1B}, {0x0002F864, 0x00005A27}, {0x0002F865, 0x000059D8}, +{0x0002F866, 0x00005A66}, {0x0002F867, 0x000036EE}, {0x0002F868, 0x000036FC}, {0x0002F869, 0x00005B08}, +{0x0002F86A, 0x00005B3E}, {0x0002F86B, 0x00005B3E}, {0x0002F86C, 0x000219C8}, {0x0002F86D, 0x00005BC3}, +{0x0002F86E, 0x00005BD8}, {0x0002F86F, 0x00005BE7}, {0x0002F870, 0x00005BF3}, {0x0002F871, 0x00021B18}, +{0x0002F872, 0x00005BFF}, {0x0002F873, 0x00005C06}, {0x0002F874, 0x00005F53}, {0x0002F875, 0x00005C22}, +{0x0002F876, 0x00003781}, {0x0002F877, 0x00005C60}, {0x0002F878, 0x00005C6E}, {0x0002F879, 0x00005CC0}, +{0x0002F87A, 0x00005C8D}, {0x0002F87B, 0x00021DE4}, {0x0002F87C, 0x00005D43}, {0x0002F87D, 0x00021DE6}, +{0x0002F87E, 0x00005D6E}, {0x0002F87F, 0x00005D6B}, {0x0002F880, 0x00005D7C}, {0x0002F881, 0x00005DE1}, +{0x0002F882, 0x00005DE2}, {0x0002F883, 0x0000382F}, {0x0002F884, 0x00005DFD}, {0x0002F885, 0x00005E28}, +{0x0002F886, 0x00005E3D}, {0x0002F887, 0x00005E69}, {0x0002F888, 0x00003862}, {0x0002F889, 0x00022183}, +{0x0002F88A, 0x0000387C}, {0x0002F88B, 0x00005EB0}, {0x0002F88C, 0x00005EB3}, {0x0002F88D, 0x00005EB6}, +{0x0002F88E, 0x00005ECA}, {0x0002F88F, 0x0002A392}, {0x0002F890, 0x00005EFE}, {0x0002F891, 0x00022331}, +{0x0002F892, 0x00022331}, {0x0002F893, 0x00008201}, {0x0002F894, 0x00005F22}, {0x0002F895, 0x00005F22}, +{0x0002F896, 0x000038C7}, {0x0002F897, 0x000232B8}, {0x0002F898, 0x000261DA}, {0x0002F899, 0x00005F62}, +{0x0002F89A, 0x00005F6B}, {0x0002F89B, 0x000038E3}, {0x0002F89C, 0x00005F9A}, {0x0002F89D, 0x00005FCD}, +{0x0002F89E, 0x00005FD7}, {0x0002F89F, 0x00005FF9}, {0x0002F8A0, 0x00006081}, {0x0002F8A1, 0x0000393A}, +{0x0002F8A2, 0x0000391C}, {0x0002F8A3, 0x00006094}, {0x0002F8A4, 0x000226D4}, {0x0002F8A5, 0x000060C7}, +{0x0002F8A6, 0x00006148}, {0x0002F8A7, 0x0000614C}, {0x0002F8A8, 0x0000614E}, {0x0002F8A9, 0x0000614C}, +{0x0002F8AA, 0x0000617A}, {0x0002F8AB, 0x0000618E}, {0x0002F8AC, 0x000061B2}, {0x0002F8AD, 0x000061A4}, +{0x0002F8AE, 0x000061AF}, {0x0002F8AF, 0x000061DE}, {0x0002F8B0, 0x000061F2}, {0x0002F8B1, 0x000061F6}, +{0x0002F8B2, 0x00006210}, {0x0002F8B3, 0x0000621B}, {0x0002F8B4, 0x0000625D}, {0x0002F8B5, 0x000062B1}, +{0x0002F8B6, 0x000062D4}, {0x0002F8B7, 0x00006350}, {0x0002F8B8, 0x00022B0C}, {0x0002F8B9, 0x0000633D}, +{0x0002F8BA, 0x000062FC}, {0x0002F8BB, 0x00006368}, {0x0002F8BC, 0x00006383}, {0x0002F8BD, 0x000063E4}, +{0x0002F8BE, 0x00022BF1}, {0x0002F8BF, 0x00006422}, {0x0002F8C0, 0x000063C5}, {0x0002F8C1, 0x000063A9}, +{0x0002F8C2, 0x00003A2E}, {0x0002F8C3, 0x00006469}, {0x0002F8C4, 0x0000647E}, {0x0002F8C5, 0x0000649D}, +{0x0002F8C6, 0x00006477}, {0x0002F8C7, 0x00003A6C}, {0x0002F8C8, 0x0000654F}, {0x0002F8C9, 0x0000656C}, +{0x0002F8CA, 0x0002300A}, {0x0002F8CB, 0x000065E3}, {0x0002F8CC, 0x000066F8}, {0x0002F8CD, 0x00006649}, +{0x0002F8CE, 0x00003B19}, {0x0002F8CF, 0x00006691}, {0x0002F8D0, 0x00003B08}, {0x0002F8D1, 0x00003AE4}, +{0x0002F8D2, 0x00005192}, {0x0002F8D3, 0x00005195}, {0x0002F8D4, 0x00006700}, {0x0002F8D5, 0x0000669C}, +{0x0002F8D6, 0x000080AD}, {0x0002F8D7, 0x000043D9}, {0x0002F8D8, 0x00006717}, {0x0002F8D9, 0x0000671B}, +{0x0002F8DA, 0x00006721}, {0x0002F8DB, 0x0000675E}, {0x0002F8DC, 0x00006753}, {0x0002F8DD, 0x000233C3}, +{0x0002F8DE, 0x00003B49}, {0x0002F8DF, 0x000067FA}, {0x0002F8E0, 0x00006785}, {0x0002F8E1, 0x00006852}, +{0x0002F8E2, 0x00006885}, {0x0002F8E3, 0x0002346D}, {0x0002F8E4, 0x0000688E}, {0x0002F8E5, 0x0000681F}, +{0x0002F8E6, 0x00006914}, {0x0002F8E7, 0x00003B9D}, {0x0002F8E8, 0x00006942}, {0x0002F8E9, 0x000069A3}, +{0x0002F8EA, 0x000069EA}, {0x0002F8EB, 0x00006AA8}, {0x0002F8EC, 0x000236A3}, {0x0002F8ED, 0x00006ADB}, +{0x0002F8EE, 0x00003C18}, {0x0002F8EF, 0x00006B21}, {0x0002F8F0, 0x000238A7}, {0x0002F8F1, 0x00006B54}, +{0x0002F8F2, 0x00003C4E}, {0x0002F8F3, 0x00006B72}, {0x0002F8F4, 0x00006B9F}, {0x0002F8F5, 0x00006BBA}, +{0x0002F8F6, 0x00006BBB}, {0x0002F8F7, 0x00023A8D}, {0x0002F8F8, 0x00021D0B}, {0x0002F8F9, 0x00023AFA}, +{0x0002F8FA, 0x00006C4E}, {0x0002F8FB, 0x00023CBC}, {0x0002F8FC, 0x00006CBF}, {0x0002F8FD, 0x00006CCD}, +{0x0002F8FE, 0x00006C67}, {0x0002F8FF, 0x00006D16}, {0x0002F900, 0x00006D3E}, {0x0002F901, 0x00006D77}, +{0x0002F902, 0x00006D41}, {0x0002F903, 0x00006D69}, {0x0002F904, 0x00006D78}, {0x0002F905, 0x00006D85}, +{0x0002F906, 0x00023D1E}, {0x0002F907, 0x00006D34}, {0x0002F908, 0x00006E2F}, {0x0002F909, 0x00006E6E}, +{0x0002F90A, 0x00003D33}, {0x0002F90B, 0x00006ECB}, {0x0002F90C, 0x00006EC7}, {0x0002F90D, 0x00023ED1}, +{0x0002F90E, 0x00006DF9}, {0x0002F90F, 0x00006F6E}, {0x0002F910, 0x00023F5E}, {0x0002F911, 0x00023F8E}, +{0x0002F912, 0x00006FC6}, {0x0002F913, 0x00007039}, {0x0002F914, 0x0000701E}, {0x0002F915, 0x0000701B}, +{0x0002F916, 0x00003D96}, {0x0002F917, 0x0000704A}, {0x0002F918, 0x0000707D}, {0x0002F919, 0x00007077}, +{0x0002F91A, 0x000070AD}, {0x0002F91B, 0x00020525}, {0x0002F91C, 0x00007145}, {0x0002F91D, 0x00024263}, +{0x0002F91E, 0x0000719C}, {0x0002F91F, 0x000243AB}, {0x0002F920, 0x00007228}, {0x0002F921, 0x00007235}, +{0x0002F922, 0x00007250}, {0x0002F923, 0x00024608}, {0x0002F924, 0x00007280}, {0x0002F925, 0x00007295}, +{0x0002F926, 0x00024735}, {0x0002F927, 0x00024814}, {0x0002F928, 0x0000737A}, {0x0002F929, 0x0000738B}, +{0x0002F92A, 0x00003EAC}, {0x0002F92B, 0x000073A5}, {0x0002F92C, 0x00003EB8}, {0x0002F92D, 0x00003EB8}, +{0x0002F92E, 0x00007447}, {0x0002F92F, 0x0000745C}, {0x0002F930, 0x00007471}, {0x0002F931, 0x00007485}, +{0x0002F932, 0x000074CA}, {0x0002F933, 0x00003F1B}, {0x0002F934, 0x00007524}, {0x0002F935, 0x00024C36}, +{0x0002F936, 0x0000753E}, {0x0002F937, 0x00024C92}, {0x0002F938, 0x00007570}, {0x0002F939, 0x0002219F}, +{0x0002F93A, 0x00007610}, {0x0002F93B, 0x00024FA1}, {0x0002F93C, 0x00024FB8}, {0x0002F93D, 0x00025044}, +{0x0002F93E, 0x00003FFC}, {0x0002F93F, 0x00004008}, {0x0002F940, 0x000076F4}, {0x0002F941, 0x000250F3}, +{0x0002F942, 0x000250F2}, {0x0002F943, 0x00025119}, {0x0002F944, 0x00025133}, {0x0002F945, 0x0000771E}, +{0x0002F946, 0x0000771F}, {0x0002F947, 0x0000771F}, {0x0002F948, 0x0000774A}, {0x0002F949, 0x00004039}, +{0x0002F94A, 0x0000778B}, {0x0002F94B, 0x00004046}, {0x0002F94C, 0x00004096}, {0x0002F94D, 0x0002541D}, +{0x0002F94E, 0x0000784E}, {0x0002F94F, 0x0000788C}, {0x0002F950, 0x000078CC}, {0x0002F951, 0x000040E3}, +{0x0002F952, 0x00025626}, {0x0002F953, 0x00007956}, {0x0002F954, 0x0002569A}, {0x0002F955, 0x000256C5}, +{0x0002F956, 0x0000798F}, {0x0002F957, 0x000079EB}, {0x0002F958, 0x0000412F}, {0x0002F959, 0x00007A40}, +{0x0002F95A, 0x00007A4A}, {0x0002F95B, 0x00007A4F}, {0x0002F95C, 0x0002597C}, {0x0002F95D, 0x00025AA7}, +{0x0002F95E, 0x00025AA7}, {0x0002F95F, 0x00007AEE}, {0x0002F960, 0x00004202}, {0x0002F961, 0x00025BAB}, +{0x0002F962, 0x00007BC6}, {0x0002F963, 0x00007BC9}, {0x0002F964, 0x00004227}, {0x0002F965, 0x00025C80}, +{0x0002F966, 0x00007CD2}, {0x0002F967, 0x000042A0}, {0x0002F968, 0x00007CE8}, {0x0002F969, 0x00007CE3}, +{0x0002F96A, 0x00007D00}, {0x0002F96B, 0x00025F86}, {0x0002F96C, 0x00007D63}, {0x0002F96D, 0x00004301}, +{0x0002F96E, 0x00007DC7}, {0x0002F96F, 0x00007E02}, {0x0002F970, 0x00007E45}, {0x0002F971, 0x00004334}, +{0x0002F972, 0x00026228}, {0x0002F973, 0x00026247}, {0x0002F974, 0x00004359}, {0x0002F975, 0x000262D9}, +{0x0002F976, 0x00007F7A}, {0x0002F977, 0x0002633E}, {0x0002F978, 0x00007F95}, {0x0002F979, 0x00007FFA}, +{0x0002F97A, 0x00008005}, {0x0002F97B, 0x000264DA}, {0x0002F97C, 0x00026523}, {0x0002F97D, 0x00008060}, +{0x0002F97E, 0x000265A8}, {0x0002F97F, 0x00008070}, {0x0002F980, 0x0002335F}, {0x0002F981, 0x000043D5}, +{0x0002F982, 0x000080B2}, {0x0002F983, 0x00008103}, {0x0002F984, 0x0000440B}, {0x0002F985, 0x0000813E}, +{0x0002F986, 0x00005AB5}, {0x0002F987, 0x000267A7}, {0x0002F988, 0x000267B5}, {0x0002F989, 0x00023393}, +{0x0002F98A, 0x0002339C}, {0x0002F98B, 0x00008201}, {0x0002F98C, 0x00008204}, {0x0002F98D, 0x00008F9E}, +{0x0002F98E, 0x0000446B}, {0x0002F98F, 0x00008291}, {0x0002F990, 0x0000828B}, {0x0002F991, 0x0000829D}, +{0x0002F992, 0x000052B3}, {0x0002F993, 0x000082B1}, {0x0002F994, 0x000082B3}, {0x0002F995, 0x000082BD}, +{0x0002F996, 0x000082E6}, {0x0002F997, 0x00026B3C}, {0x0002F998, 0x000082E5}, {0x0002F999, 0x0000831D}, +{0x0002F99A, 0x00008363}, {0x0002F99B, 0x000083AD}, {0x0002F99C, 0x00008323}, {0x0002F99D, 0x000083BD}, +{0x0002F99E, 0x000083E7}, {0x0002F99F, 0x00008457}, {0x0002F9A0, 0x00008353}, {0x0002F9A1, 0x000083CA}, +{0x0002F9A2, 0x000083CC}, {0x0002F9A3, 0x000083DC}, {0x0002F9A4, 0x00026C36}, {0x0002F9A5, 0x00026D6B}, +{0x0002F9A6, 0x00026CD5}, {0x0002F9A7, 0x0000452B}, {0x0002F9A8, 0x000084F1}, {0x0002F9A9, 0x000084F3}, +{0x0002F9AA, 0x00008516}, {0x0002F9AB, 0x000273CA}, {0x0002F9AC, 0x00008564}, {0x0002F9AD, 0x00026F2C}, +{0x0002F9AE, 0x0000455D}, {0x0002F9AF, 0x00004561}, {0x0002F9B0, 0x00026FB1}, {0x0002F9B1, 0x000270D2}, +{0x0002F9B2, 0x0000456B}, {0x0002F9B3, 0x00008650}, {0x0002F9B4, 0x0000865C}, {0x0002F9B5, 0x00008667}, +{0x0002F9B6, 0x00008669}, {0x0002F9B7, 0x000086A9}, {0x0002F9B8, 0x00008688}, {0x0002F9B9, 0x0000870E}, +{0x0002F9BA, 0x000086E2}, {0x0002F9BB, 0x00008779}, {0x0002F9BC, 0x00008728}, {0x0002F9BD, 0x0000876B}, +{0x0002F9BE, 0x00008786}, {0x0002F9BF, 0x000045D7}, {0x0002F9C0, 0x000087E1}, {0x0002F9C1, 0x00008801}, +{0x0002F9C2, 0x000045F9}, {0x0002F9C3, 0x00008860}, {0x0002F9C4, 0x00008863}, {0x0002F9C5, 0x00027667}, +{0x0002F9C6, 0x000088D7}, {0x0002F9C7, 0x000088DE}, {0x0002F9C8, 0x00004635}, {0x0002F9C9, 0x000088FA}, +{0x0002F9CA, 0x000034BB}, {0x0002F9CB, 0x000278AE}, {0x0002F9CC, 0x00027966}, {0x0002F9CD, 0x000046BE}, +{0x0002F9CE, 0x000046C7}, {0x0002F9CF, 0x00008AA0}, {0x0002F9D0, 0x00008AED}, {0x0002F9D1, 0x00008B8A}, +{0x0002F9D2, 0x00008C55}, {0x0002F9D3, 0x00027CA8}, {0x0002F9D4, 0x00008CAB}, {0x0002F9D5, 0x00008CC1}, +{0x0002F9D6, 0x00008D1B}, {0x0002F9D7, 0x00008D77}, {0x0002F9D8, 0x00027F2F}, {0x0002F9D9, 0x00020804}, +{0x0002F9DA, 0x00008DCB}, {0x0002F9DB, 0x00008DBC}, {0x0002F9DC, 0x00008DF0}, {0x0002F9DD, 0x000208DE}, +{0x0002F9DE, 0x00008ED4}, {0x0002F9DF, 0x00008F38}, {0x0002F9E0, 0x000285D2}, {0x0002F9E1, 0x000285ED}, +{0x0002F9E2, 0x00009094}, {0x0002F9E3, 0x000090F1}, {0x0002F9E4, 0x00009111}, {0x0002F9E5, 0x0002872E}, +{0x0002F9E6, 0x0000911B}, {0x0002F9E7, 0x00009238}, {0x0002F9E8, 0x000092D7}, {0x0002F9E9, 0x000092D8}, +{0x0002F9EA, 0x0000927C}, {0x0002F9EB, 0x000093F9}, {0x0002F9EC, 0x00009415}, {0x0002F9ED, 0x00028BFA}, +{0x0002F9EE, 0x0000958B}, {0x0002F9EF, 0x00004995}, {0x0002F9F0, 0x000095B7}, {0x0002F9F1, 0x00028D77}, +{0x0002F9F2, 0x000049E6}, {0x0002F9F3, 0x000096C3}, {0x0002F9F4, 0x00005DB2}, {0x0002F9F5, 0x00009723}, +{0x0002F9F6, 0x00029145}, {0x0002F9F7, 0x0002921A}, {0x0002F9F8, 0x00004A6E}, {0x0002F9F9, 0x00004A76}, +{0x0002F9FA, 0x000097E0}, {0x0002F9FB, 0x0002940A}, {0x0002F9FC, 0x00004AB2}, {0x0002F9FD, 0x00029496}, +{0x0002F9FE, 0x0000980B}, {0x0002F9FF, 0x0000980B}, {0x0002FA00, 0x00009829}, {0x0002FA01, 0x000295B6}, +{0x0002FA02, 0x000098E2}, {0x0002FA03, 0x00004B33}, {0x0002FA04, 0x00009929}, {0x0002FA05, 0x000099A7}, +{0x0002FA06, 0x000099C2}, {0x0002FA07, 0x000099FE}, {0x0002FA08, 0x00004BCE}, {0x0002FA09, 0x00029B30}, +{0x0002FA0A, 0x00009B12}, {0x0002FA0B, 0x00009C40}, {0x0002FA0C, 0x00009CFD}, {0x0002FA0D, 0x00004CCE}, +{0x0002FA0E, 0x00004CED}, {0x0002FA0F, 0x00009D67}, {0x0002FA10, 0x0002A0CE}, {0x0002FA11, 0x00004CF8}, +{0x0002FA12, 0x0002A105}, {0x0002FA13, 0x0002A20E}, {0x0002FA14, 0x0002A291}, {0x0002FA15, 0x00009EBB}, +{0x0002FA16, 0x00004D56}, {0x0002FA17, 0x00009EF9}, {0x0002FA18, 0x00009EFE}, {0x0002FA19, 0x00009F05}, +{0x0002FA1A, 0x00009F0F}, {0x0002FA1B, 0x00009F16}, {0x0002FA1D, 0x0002A600}, +}; diff --git a/unicode-data.h b/unicode-data.h new file mode 100644 index 000000000..3cccf2068 --- /dev/null +++ b/unicode-data.h @@ -0,0 +1,17 @@ +#pragma once + +#include +#include +#include +#include + +extern const std::vector> unicode_ranges_number; +extern const std::vector> unicode_ranges_letter; +extern const std::vector> unicode_ranges_separator; +extern const std::vector> unicode_ranges_whitespace; +extern const std::vector> unicode_ranges_accent_mark; +extern const std::vector> unicode_ranges_punctuation; +extern const std::vector> unicode_ranges_symbol; +extern const std::vector> unicode_ranges_control; +extern const std::multimap unicode_map_nfd; +extern const std::map unicode_map_lowercase; diff --git a/unicode.cpp b/unicode.cpp index 7fce6fb34..ca03c49d3 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -1,1414 +1,19 @@ #include "unicode.h" +#include "unicode-data.h" #include +#include +#include #include +#include #include #include #include +#include +#include #include - -static const std::vector> unicode_ranges_digit = { -{0x00000030, 0x00000039}, {0x000000B2, 0x000000B3}, {0x000000B9, 0x000000B9}, {0x00000660, 0x00000669}, -{0x000006F0, 0x000006F9}, {0x000007C0, 0x000007C9}, {0x00000966, 0x0000096F}, {0x000009E6, 0x000009EF}, -{0x00000A66, 0x00000A6F}, {0x00000AE6, 0x00000AEF}, {0x00000B66, 0x00000B6F}, {0x00000BE6, 0x00000BEF}, -{0x00000C66, 0x00000C6F}, {0x00000CE6, 0x00000CEF}, {0x00000D66, 0x00000D6F}, {0x00000DE6, 0x00000DEF}, -{0x00000E50, 0x00000E59}, {0x00000ED0, 0x00000ED9}, {0x00000F20, 0x00000F29}, {0x00001040, 0x00001049}, -{0x00001090, 0x00001099}, {0x00001369, 0x00001371}, {0x000017E0, 0x000017E9}, {0x00001810, 0x00001819}, -{0x00001946, 0x0000194F}, {0x000019D0, 0x000019DA}, {0x00001A80, 0x00001A89}, {0x00001A90, 0x00001A99}, -{0x00001B50, 0x00001B59}, {0x00001BB0, 0x00001BB9}, {0x00001C40, 0x00001C49}, {0x00001C50, 0x00001C59}, -{0x00002070, 0x00002070}, {0x00002074, 0x00002079}, {0x00002080, 0x00002089}, {0x00002460, 0x00002468}, -{0x00002474, 0x0000247C}, {0x00002488, 0x00002490}, {0x000024EA, 0x000024EA}, {0x000024F5, 0x000024FD}, -{0x000024FF, 0x000024FF}, {0x00002776, 0x0000277E}, {0x00002780, 0x00002788}, {0x0000278A, 0x00002792}, -{0x0000A620, 0x0000A629}, {0x0000A8D0, 0x0000A8D9}, {0x0000A900, 0x0000A909}, {0x0000A9D0, 0x0000A9D9}, -{0x0000A9F0, 0x0000A9F9}, {0x0000AA50, 0x0000AA59}, {0x0000ABF0, 0x0000ABF9}, {0x0000FF10, 0x0000FF19}, -{0x000104A0, 0x000104A9}, {0x00010A40, 0x00010A43}, {0x00010D30, 0x00010D39}, {0x00010E60, 0x00010E68}, -{0x00011052, 0x0001105A}, {0x00011066, 0x0001106F}, {0x000110F0, 0x000110F9}, {0x00011136, 0x0001113F}, -{0x000111D0, 0x000111D9}, {0x000112F0, 0x000112F9}, {0x00011450, 0x00011459}, {0x000114D0, 0x000114D9}, -{0x00011650, 0x00011659}, {0x000116C0, 0x000116C9}, {0x00011730, 0x00011739}, {0x000118E0, 0x000118E9}, -{0x00011950, 0x00011959}, {0x00011C50, 0x00011C59}, {0x00011D50, 0x00011D59}, {0x00011DA0, 0x00011DA9}, -{0x00016A60, 0x00016A69}, {0x00016B50, 0x00016B59}, {0x0001D7CE, 0x0001D7FF}, {0x0001E140, 0x0001E149}, -{0x0001E2F0, 0x0001E2F9}, {0x0001E950, 0x0001E959}, {0x0001F100, 0x0001F10A}, {0x0001FBF0, 0x0001FBF9}, -}; - -static const std::vector> unicode_ranges_letter = { -{0x00000041, 0x0000005A}, {0x00000061, 0x0000007A}, {0x000000AA, 0x000000AA}, {0x000000B5, 0x000000B5}, -{0x000000BA, 0x000000BA}, {0x000000C0, 0x000000D6}, {0x000000D8, 0x000000F6}, {0x000000F8, 0x000002C1}, -{0x000002C6, 0x000002D1}, {0x000002E0, 0x000002E4}, {0x000002EC, 0x000002EC}, {0x000002EE, 0x000002EE}, -{0x00000370, 0x00000374}, {0x00000376, 0x00000377}, {0x0000037A, 0x0000037D}, {0x0000037F, 0x0000037F}, -{0x00000386, 0x00000386}, {0x00000388, 0x0000038A}, {0x0000038C, 0x0000038C}, {0x0000038E, 0x000003A1}, -{0x000003A3, 0x000003F5}, {0x000003F7, 0x00000481}, {0x0000048A, 0x0000052F}, {0x00000531, 0x00000556}, -{0x00000559, 0x00000559}, {0x00000560, 0x00000588}, {0x000005D0, 0x000005EA}, {0x000005EF, 0x000005F2}, -{0x00000620, 0x0000064A}, {0x0000066E, 0x0000066F}, {0x00000671, 0x000006D3}, {0x000006D5, 0x000006D5}, -{0x000006E5, 0x000006E6}, {0x000006EE, 0x000006EF}, {0x000006FA, 0x000006FC}, {0x000006FF, 0x000006FF}, -{0x00000710, 0x00000710}, {0x00000712, 0x0000072F}, {0x0000074D, 0x000007A5}, {0x000007B1, 0x000007B1}, -{0x000007CA, 0x000007EA}, {0x000007F4, 0x000007F5}, {0x000007FA, 0x000007FA}, {0x00000800, 0x00000815}, -{0x0000081A, 0x0000081A}, {0x00000824, 0x00000824}, {0x00000828, 0x00000828}, {0x00000840, 0x00000858}, -{0x00000860, 0x0000086A}, {0x000008A0, 0x000008B4}, {0x000008B6, 0x000008C7}, {0x00000904, 0x00000939}, -{0x0000093D, 0x0000093D}, {0x00000950, 0x00000950}, {0x00000958, 0x00000961}, {0x00000971, 0x00000980}, -{0x00000985, 0x0000098C}, {0x0000098F, 0x00000990}, {0x00000993, 0x000009A8}, {0x000009AA, 0x000009B0}, -{0x000009B2, 0x000009B2}, {0x000009B6, 0x000009B9}, {0x000009BD, 0x000009BD}, {0x000009CE, 0x000009CE}, -{0x000009DC, 0x000009DD}, {0x000009DF, 0x000009E1}, {0x000009F0, 0x000009F1}, {0x000009FC, 0x000009FC}, -{0x00000A05, 0x00000A0A}, {0x00000A0F, 0x00000A10}, {0x00000A13, 0x00000A28}, {0x00000A2A, 0x00000A30}, -{0x00000A32, 0x00000A33}, {0x00000A35, 0x00000A36}, {0x00000A38, 0x00000A39}, {0x00000A59, 0x00000A5C}, -{0x00000A5E, 0x00000A5E}, {0x00000A72, 0x00000A74}, {0x00000A85, 0x00000A8D}, {0x00000A8F, 0x00000A91}, -{0x00000A93, 0x00000AA8}, {0x00000AAA, 0x00000AB0}, {0x00000AB2, 0x00000AB3}, {0x00000AB5, 0x00000AB9}, -{0x00000ABD, 0x00000ABD}, {0x00000AD0, 0x00000AD0}, {0x00000AE0, 0x00000AE1}, {0x00000AF9, 0x00000AF9}, -{0x00000B05, 0x00000B0C}, {0x00000B0F, 0x00000B10}, {0x00000B13, 0x00000B28}, {0x00000B2A, 0x00000B30}, -{0x00000B32, 0x00000B33}, {0x00000B35, 0x00000B39}, {0x00000B3D, 0x00000B3D}, {0x00000B5C, 0x00000B5D}, -{0x00000B5F, 0x00000B61}, {0x00000B71, 0x00000B71}, {0x00000B83, 0x00000B83}, {0x00000B85, 0x00000B8A}, -{0x00000B8E, 0x00000B90}, {0x00000B92, 0x00000B95}, {0x00000B99, 0x00000B9A}, {0x00000B9C, 0x00000B9C}, -{0x00000B9E, 0x00000B9F}, {0x00000BA3, 0x00000BA4}, {0x00000BA8, 0x00000BAA}, {0x00000BAE, 0x00000BB9}, -{0x00000BD0, 0x00000BD0}, {0x00000C05, 0x00000C0C}, {0x00000C0E, 0x00000C10}, {0x00000C12, 0x00000C28}, -{0x00000C2A, 0x00000C39}, {0x00000C3D, 0x00000C3D}, {0x00000C58, 0x00000C5A}, {0x00000C60, 0x00000C61}, -{0x00000C80, 0x00000C80}, {0x00000C85, 0x00000C8C}, {0x00000C8E, 0x00000C90}, {0x00000C92, 0x00000CA8}, -{0x00000CAA, 0x00000CB3}, {0x00000CB5, 0x00000CB9}, {0x00000CBD, 0x00000CBD}, {0x00000CDE, 0x00000CDE}, -{0x00000CE0, 0x00000CE1}, {0x00000CF1, 0x00000CF2}, {0x00000D04, 0x00000D0C}, {0x00000D0E, 0x00000D10}, -{0x00000D12, 0x00000D3A}, {0x00000D3D, 0x00000D3D}, {0x00000D4E, 0x00000D4E}, {0x00000D54, 0x00000D56}, -{0x00000D5F, 0x00000D61}, {0x00000D7A, 0x00000D7F}, {0x00000D85, 0x00000D96}, {0x00000D9A, 0x00000DB1}, -{0x00000DB3, 0x00000DBB}, {0x00000DBD, 0x00000DBD}, {0x00000DC0, 0x00000DC6}, {0x00000E01, 0x00000E30}, -{0x00000E32, 0x00000E33}, {0x00000E40, 0x00000E46}, {0x00000E81, 0x00000E82}, {0x00000E84, 0x00000E84}, -{0x00000E86, 0x00000E8A}, {0x00000E8C, 0x00000EA3}, {0x00000EA5, 0x00000EA5}, {0x00000EA7, 0x00000EB0}, -{0x00000EB2, 0x00000EB3}, {0x00000EBD, 0x00000EBD}, {0x00000EC0, 0x00000EC4}, {0x00000EC6, 0x00000EC6}, -{0x00000EDC, 0x00000EDF}, {0x00000F00, 0x00000F00}, {0x00000F40, 0x00000F47}, {0x00000F49, 0x00000F6C}, -{0x00000F88, 0x00000F8C}, {0x00001000, 0x0000102A}, {0x0000103F, 0x0000103F}, {0x00001050, 0x00001055}, -{0x0000105A, 0x0000105D}, {0x00001061, 0x00001061}, {0x00001065, 0x00001066}, {0x0000106E, 0x00001070}, -{0x00001075, 0x00001081}, {0x0000108E, 0x0000108E}, {0x000010A0, 0x000010C5}, {0x000010C7, 0x000010C7}, -{0x000010CD, 0x000010CD}, {0x000010D0, 0x000010FA}, {0x000010FC, 0x00001248}, {0x0000124A, 0x0000124D}, -{0x00001250, 0x00001256}, {0x00001258, 0x00001258}, {0x0000125A, 0x0000125D}, {0x00001260, 0x00001288}, -{0x0000128A, 0x0000128D}, {0x00001290, 0x000012B0}, {0x000012B2, 0x000012B5}, {0x000012B8, 0x000012BE}, -{0x000012C0, 0x000012C0}, {0x000012C2, 0x000012C5}, {0x000012C8, 0x000012D6}, {0x000012D8, 0x00001310}, -{0x00001312, 0x00001315}, {0x00001318, 0x0000135A}, {0x00001380, 0x0000138F}, {0x000013A0, 0x000013F5}, -{0x000013F8, 0x000013FD}, {0x00001401, 0x0000166C}, {0x0000166F, 0x0000167F}, {0x00001681, 0x0000169A}, -{0x000016A0, 0x000016EA}, {0x000016F1, 0x000016F8}, {0x00001700, 0x0000170C}, {0x0000170E, 0x00001711}, -{0x00001720, 0x00001731}, {0x00001740, 0x00001751}, {0x00001760, 0x0000176C}, {0x0000176E, 0x00001770}, -{0x00001780, 0x000017B3}, {0x000017D7, 0x000017D7}, {0x000017DC, 0x000017DC}, {0x00001820, 0x00001878}, -{0x00001880, 0x00001884}, {0x00001887, 0x000018A8}, {0x000018AA, 0x000018AA}, {0x000018B0, 0x000018F5}, -{0x00001900, 0x0000191E}, {0x00001950, 0x0000196D}, {0x00001970, 0x00001974}, {0x00001980, 0x000019AB}, -{0x000019B0, 0x000019C9}, {0x00001A00, 0x00001A16}, {0x00001A20, 0x00001A54}, {0x00001AA7, 0x00001AA7}, -{0x00001B05, 0x00001B33}, {0x00001B45, 0x00001B4B}, {0x00001B83, 0x00001BA0}, {0x00001BAE, 0x00001BAF}, -{0x00001BBA, 0x00001BE5}, {0x00001C00, 0x00001C23}, {0x00001C4D, 0x00001C4F}, {0x00001C5A, 0x00001C7D}, -{0x00001C80, 0x00001C88}, {0x00001C90, 0x00001CBA}, {0x00001CBD, 0x00001CBF}, {0x00001CE9, 0x00001CEC}, -{0x00001CEE, 0x00001CF3}, {0x00001CF5, 0x00001CF6}, {0x00001CFA, 0x00001CFA}, {0x00001D00, 0x00001DBF}, -{0x00001E00, 0x00001F15}, {0x00001F18, 0x00001F1D}, {0x00001F20, 0x00001F45}, {0x00001F48, 0x00001F4D}, -{0x00001F50, 0x00001F57}, {0x00001F59, 0x00001F59}, {0x00001F5B, 0x00001F5B}, {0x00001F5D, 0x00001F5D}, -{0x00001F5F, 0x00001F7D}, {0x00001F80, 0x00001FB4}, {0x00001FB6, 0x00001FBC}, {0x00001FBE, 0x00001FBE}, -{0x00001FC2, 0x00001FC4}, {0x00001FC6, 0x00001FCC}, {0x00001FD0, 0x00001FD3}, {0x00001FD6, 0x00001FDB}, -{0x00001FE0, 0x00001FEC}, {0x00001FF2, 0x00001FF4}, {0x00001FF6, 0x00001FFC}, {0x00002071, 0x00002071}, -{0x0000207F, 0x0000207F}, {0x00002090, 0x0000209C}, {0x00002102, 0x00002102}, {0x00002107, 0x00002107}, -{0x0000210A, 0x00002113}, {0x00002115, 0x00002115}, {0x00002119, 0x0000211D}, {0x00002124, 0x00002124}, -{0x00002126, 0x00002126}, {0x00002128, 0x00002128}, {0x0000212A, 0x0000212D}, {0x0000212F, 0x00002139}, -{0x0000213C, 0x0000213F}, {0x00002145, 0x00002149}, {0x0000214E, 0x0000214E}, {0x00002183, 0x00002184}, -{0x00002C00, 0x00002C2E}, {0x00002C30, 0x00002C5E}, {0x00002C60, 0x00002CE4}, {0x00002CEB, 0x00002CEE}, -{0x00002CF2, 0x00002CF3}, {0x00002D00, 0x00002D25}, {0x00002D27, 0x00002D27}, {0x00002D2D, 0x00002D2D}, -{0x00002D30, 0x00002D67}, {0x00002D6F, 0x00002D6F}, {0x00002D80, 0x00002D96}, {0x00002DA0, 0x00002DA6}, -{0x00002DA8, 0x00002DAE}, {0x00002DB0, 0x00002DB6}, {0x00002DB8, 0x00002DBE}, {0x00002DC0, 0x00002DC6}, -{0x00002DC8, 0x00002DCE}, {0x00002DD0, 0x00002DD6}, {0x00002DD8, 0x00002DDE}, {0x00002E2F, 0x00002E2F}, -{0x00003005, 0x00003006}, {0x00003031, 0x00003035}, {0x0000303B, 0x0000303C}, {0x00003041, 0x00003096}, -{0x0000309D, 0x0000309F}, {0x000030A1, 0x000030FA}, {0x000030FC, 0x000030FF}, {0x00003105, 0x0000312F}, -{0x00003131, 0x0000318E}, {0x000031A0, 0x000031BF}, {0x000031F0, 0x000031FF}, {0x00003400, 0x00004DBF}, -{0x00004E00, 0x00009FFC}, {0x0000A000, 0x0000A48C}, {0x0000A4D0, 0x0000A4FD}, {0x0000A500, 0x0000A60C}, -{0x0000A610, 0x0000A61F}, {0x0000A62A, 0x0000A62B}, {0x0000A640, 0x0000A66E}, {0x0000A67F, 0x0000A69D}, -{0x0000A6A0, 0x0000A6E5}, {0x0000A717, 0x0000A71F}, {0x0000A722, 0x0000A788}, {0x0000A78B, 0x0000A7BF}, -{0x0000A7C2, 0x0000A7CA}, {0x0000A7F5, 0x0000A801}, {0x0000A803, 0x0000A805}, {0x0000A807, 0x0000A80A}, -{0x0000A80C, 0x0000A822}, {0x0000A840, 0x0000A873}, {0x0000A882, 0x0000A8B3}, {0x0000A8F2, 0x0000A8F7}, -{0x0000A8FB, 0x0000A8FB}, {0x0000A8FD, 0x0000A8FE}, {0x0000A90A, 0x0000A925}, {0x0000A930, 0x0000A946}, -{0x0000A960, 0x0000A97C}, {0x0000A984, 0x0000A9B2}, {0x0000A9CF, 0x0000A9CF}, {0x0000A9E0, 0x0000A9E4}, -{0x0000A9E6, 0x0000A9EF}, {0x0000A9FA, 0x0000A9FE}, {0x0000AA00, 0x0000AA28}, {0x0000AA40, 0x0000AA42}, -{0x0000AA44, 0x0000AA4B}, {0x0000AA60, 0x0000AA76}, {0x0000AA7A, 0x0000AA7A}, {0x0000AA7E, 0x0000AAAF}, -{0x0000AAB1, 0x0000AAB1}, {0x0000AAB5, 0x0000AAB6}, {0x0000AAB9, 0x0000AABD}, {0x0000AAC0, 0x0000AAC0}, -{0x0000AAC2, 0x0000AAC2}, {0x0000AADB, 0x0000AADD}, {0x0000AAE0, 0x0000AAEA}, {0x0000AAF2, 0x0000AAF4}, -{0x0000AB01, 0x0000AB06}, {0x0000AB09, 0x0000AB0E}, {0x0000AB11, 0x0000AB16}, {0x0000AB20, 0x0000AB26}, -{0x0000AB28, 0x0000AB2E}, {0x0000AB30, 0x0000AB5A}, {0x0000AB5C, 0x0000AB69}, {0x0000AB70, 0x0000ABE2}, -{0x0000AC00, 0x0000D7A3}, {0x0000D7B0, 0x0000D7C6}, {0x0000D7CB, 0x0000D7FB}, {0x0000F900, 0x0000FA6D}, -{0x0000FA70, 0x0000FAD9}, {0x0000FB00, 0x0000FB06}, {0x0000FB13, 0x0000FB17}, {0x0000FB1D, 0x0000FB1D}, -{0x0000FB1F, 0x0000FB28}, {0x0000FB2A, 0x0000FB36}, {0x0000FB38, 0x0000FB3C}, {0x0000FB3E, 0x0000FB3E}, -{0x0000FB40, 0x0000FB41}, {0x0000FB43, 0x0000FB44}, {0x0000FB46, 0x0000FBB1}, {0x0000FBD3, 0x0000FD3D}, -{0x0000FD50, 0x0000FD8F}, {0x0000FD92, 0x0000FDC7}, {0x0000FDF0, 0x0000FDFB}, {0x0000FE70, 0x0000FE74}, -{0x0000FE76, 0x0000FEFC}, {0x0000FF21, 0x0000FF3A}, {0x0000FF41, 0x0000FF5A}, {0x0000FF66, 0x0000FFBE}, -{0x0000FFC2, 0x0000FFC7}, {0x0000FFCA, 0x0000FFCF}, {0x0000FFD2, 0x0000FFD7}, {0x0000FFDA, 0x0000FFDC}, -{0x00010000, 0x0001000B}, {0x0001000D, 0x00010026}, {0x00010028, 0x0001003A}, {0x0001003C, 0x0001003D}, -{0x0001003F, 0x0001004D}, {0x00010050, 0x0001005D}, {0x00010080, 0x000100FA}, {0x00010280, 0x0001029C}, -{0x000102A0, 0x000102D0}, {0x00010300, 0x0001031F}, {0x0001032D, 0x00010340}, {0x00010342, 0x00010349}, -{0x00010350, 0x00010375}, {0x00010380, 0x0001039D}, {0x000103A0, 0x000103C3}, {0x000103C8, 0x000103CF}, -{0x00010400, 0x0001049D}, {0x000104B0, 0x000104D3}, {0x000104D8, 0x000104FB}, {0x00010500, 0x00010527}, -{0x00010530, 0x00010563}, {0x00010600, 0x00010736}, {0x00010740, 0x00010755}, {0x00010760, 0x00010767}, -{0x00010800, 0x00010805}, {0x00010808, 0x00010808}, {0x0001080A, 0x00010835}, {0x00010837, 0x00010838}, -{0x0001083C, 0x0001083C}, {0x0001083F, 0x00010855}, {0x00010860, 0x00010876}, {0x00010880, 0x0001089E}, -{0x000108E0, 0x000108F2}, {0x000108F4, 0x000108F5}, {0x00010900, 0x00010915}, {0x00010920, 0x00010939}, -{0x00010980, 0x000109B7}, {0x000109BE, 0x000109BF}, {0x00010A00, 0x00010A00}, {0x00010A10, 0x00010A13}, -{0x00010A15, 0x00010A17}, {0x00010A19, 0x00010A35}, {0x00010A60, 0x00010A7C}, {0x00010A80, 0x00010A9C}, -{0x00010AC0, 0x00010AC7}, {0x00010AC9, 0x00010AE4}, {0x00010B00, 0x00010B35}, {0x00010B40, 0x00010B55}, -{0x00010B60, 0x00010B72}, {0x00010B80, 0x00010B91}, {0x00010C00, 0x00010C48}, {0x00010C80, 0x00010CB2}, -{0x00010CC0, 0x00010CF2}, {0x00010D00, 0x00010D23}, {0x00010E80, 0x00010EA9}, {0x00010EB0, 0x00010EB1}, -{0x00010F00, 0x00010F1C}, {0x00010F27, 0x00010F27}, {0x00010F30, 0x00010F45}, {0x00010FB0, 0x00010FC4}, -{0x00010FE0, 0x00010FF6}, {0x00011003, 0x00011037}, {0x00011083, 0x000110AF}, {0x000110D0, 0x000110E8}, -{0x00011103, 0x00011126}, {0x00011144, 0x00011144}, {0x00011147, 0x00011147}, {0x00011150, 0x00011172}, -{0x00011176, 0x00011176}, {0x00011183, 0x000111B2}, {0x000111C1, 0x000111C4}, {0x000111DA, 0x000111DA}, -{0x000111DC, 0x000111DC}, {0x00011200, 0x00011211}, {0x00011213, 0x0001122B}, {0x00011280, 0x00011286}, -{0x00011288, 0x00011288}, {0x0001128A, 0x0001128D}, {0x0001128F, 0x0001129D}, {0x0001129F, 0x000112A8}, -{0x000112B0, 0x000112DE}, {0x00011305, 0x0001130C}, {0x0001130F, 0x00011310}, {0x00011313, 0x00011328}, -{0x0001132A, 0x00011330}, {0x00011332, 0x00011333}, {0x00011335, 0x00011339}, {0x0001133D, 0x0001133D}, -{0x00011350, 0x00011350}, {0x0001135D, 0x00011361}, {0x00011400, 0x00011434}, {0x00011447, 0x0001144A}, -{0x0001145F, 0x00011461}, {0x00011480, 0x000114AF}, {0x000114C4, 0x000114C5}, {0x000114C7, 0x000114C7}, -{0x00011580, 0x000115AE}, {0x000115D8, 0x000115DB}, {0x00011600, 0x0001162F}, {0x00011644, 0x00011644}, -{0x00011680, 0x000116AA}, {0x000116B8, 0x000116B8}, {0x00011700, 0x0001171A}, {0x00011800, 0x0001182B}, -{0x000118A0, 0x000118DF}, {0x000118FF, 0x00011906}, {0x00011909, 0x00011909}, {0x0001190C, 0x00011913}, -{0x00011915, 0x00011916}, {0x00011918, 0x0001192F}, {0x0001193F, 0x0001193F}, {0x00011941, 0x00011941}, -{0x000119A0, 0x000119A7}, {0x000119AA, 0x000119D0}, {0x000119E1, 0x000119E1}, {0x000119E3, 0x000119E3}, -{0x00011A00, 0x00011A00}, {0x00011A0B, 0x00011A32}, {0x00011A3A, 0x00011A3A}, {0x00011A50, 0x00011A50}, -{0x00011A5C, 0x00011A89}, {0x00011A9D, 0x00011A9D}, {0x00011AC0, 0x00011AF8}, {0x00011C00, 0x00011C08}, -{0x00011C0A, 0x00011C2E}, {0x00011C40, 0x00011C40}, {0x00011C72, 0x00011C8F}, {0x00011D00, 0x00011D06}, -{0x00011D08, 0x00011D09}, {0x00011D0B, 0x00011D30}, {0x00011D46, 0x00011D46}, {0x00011D60, 0x00011D65}, -{0x00011D67, 0x00011D68}, {0x00011D6A, 0x00011D89}, {0x00011D98, 0x00011D98}, {0x00011EE0, 0x00011EF2}, -{0x00011FB0, 0x00011FB0}, {0x00012000, 0x00012399}, {0x00012480, 0x00012543}, {0x00013000, 0x0001342E}, -{0x00014400, 0x00014646}, {0x00016800, 0x00016A38}, {0x00016A40, 0x00016A5E}, {0x00016AD0, 0x00016AED}, -{0x00016B00, 0x00016B2F}, {0x00016B40, 0x00016B43}, {0x00016B63, 0x00016B77}, {0x00016B7D, 0x00016B8F}, -{0x00016E40, 0x00016E7F}, {0x00016F00, 0x00016F4A}, {0x00016F50, 0x00016F50}, {0x00016F93, 0x00016F9F}, -{0x00016FE0, 0x00016FE1}, {0x00016FE3, 0x00016FE3}, {0x00017000, 0x000187F7}, {0x00018800, 0x00018CD5}, -{0x00018D00, 0x00018D08}, {0x0001B000, 0x0001B11E}, {0x0001B150, 0x0001B152}, {0x0001B164, 0x0001B167}, -{0x0001B170, 0x0001B2FB}, {0x0001BC00, 0x0001BC6A}, {0x0001BC70, 0x0001BC7C}, {0x0001BC80, 0x0001BC88}, -{0x0001BC90, 0x0001BC99}, {0x0001D400, 0x0001D454}, {0x0001D456, 0x0001D49C}, {0x0001D49E, 0x0001D49F}, -{0x0001D4A2, 0x0001D4A2}, {0x0001D4A5, 0x0001D4A6}, {0x0001D4A9, 0x0001D4AC}, {0x0001D4AE, 0x0001D4B9}, -{0x0001D4BB, 0x0001D4BB}, {0x0001D4BD, 0x0001D4C3}, {0x0001D4C5, 0x0001D505}, {0x0001D507, 0x0001D50A}, -{0x0001D50D, 0x0001D514}, {0x0001D516, 0x0001D51C}, {0x0001D51E, 0x0001D539}, {0x0001D53B, 0x0001D53E}, -{0x0001D540, 0x0001D544}, {0x0001D546, 0x0001D546}, {0x0001D54A, 0x0001D550}, {0x0001D552, 0x0001D6A5}, -{0x0001D6A8, 0x0001D6C0}, {0x0001D6C2, 0x0001D6DA}, {0x0001D6DC, 0x0001D6FA}, {0x0001D6FC, 0x0001D714}, -{0x0001D716, 0x0001D734}, {0x0001D736, 0x0001D74E}, {0x0001D750, 0x0001D76E}, {0x0001D770, 0x0001D788}, -{0x0001D78A, 0x0001D7A8}, {0x0001D7AA, 0x0001D7C2}, {0x0001D7C4, 0x0001D7CB}, {0x0001E100, 0x0001E12C}, -{0x0001E137, 0x0001E13D}, {0x0001E14E, 0x0001E14E}, {0x0001E2C0, 0x0001E2EB}, {0x0001E800, 0x0001E8C4}, -{0x0001E900, 0x0001E943}, {0x0001E94B, 0x0001E94B}, {0x0001EE00, 0x0001EE03}, {0x0001EE05, 0x0001EE1F}, -{0x0001EE21, 0x0001EE22}, {0x0001EE24, 0x0001EE24}, {0x0001EE27, 0x0001EE27}, {0x0001EE29, 0x0001EE32}, -{0x0001EE34, 0x0001EE37}, {0x0001EE39, 0x0001EE39}, {0x0001EE3B, 0x0001EE3B}, {0x0001EE42, 0x0001EE42}, -{0x0001EE47, 0x0001EE47}, {0x0001EE49, 0x0001EE49}, {0x0001EE4B, 0x0001EE4B}, {0x0001EE4D, 0x0001EE4F}, -{0x0001EE51, 0x0001EE52}, {0x0001EE54, 0x0001EE54}, {0x0001EE57, 0x0001EE57}, {0x0001EE59, 0x0001EE59}, -{0x0001EE5B, 0x0001EE5B}, {0x0001EE5D, 0x0001EE5D}, {0x0001EE5F, 0x0001EE5F}, {0x0001EE61, 0x0001EE62}, -{0x0001EE64, 0x0001EE64}, {0x0001EE67, 0x0001EE6A}, {0x0001EE6C, 0x0001EE72}, {0x0001EE74, 0x0001EE77}, -{0x0001EE79, 0x0001EE7C}, {0x0001EE7E, 0x0001EE7E}, {0x0001EE80, 0x0001EE89}, {0x0001EE8B, 0x0001EE9B}, -{0x0001EEA1, 0x0001EEA3}, {0x0001EEA5, 0x0001EEA9}, {0x0001EEAB, 0x0001EEBB}, {0x00020000, 0x0002A6DD}, -{0x0002A700, 0x0002B734}, {0x0002B740, 0x0002B81D}, {0x0002B820, 0x0002CEA1}, {0x0002CEB0, 0x0002EBE0}, -{0x0002F800, 0x0002FA1D}, {0x00030000, 0x0003134A}, -}; - -static const std::vector> unicode_ranges_whitespace = { -{0x00000009, 0x0000000D}, {0x0000001C, 0x00000020}, {0x00000085, 0x00000085}, {0x000000A0, 0x000000A0}, -{0x00001680, 0x00001680}, {0x00002000, 0x0000200A}, {0x00002028, 0x00002029}, {0x0000202F, 0x0000202F}, -{0x0000205F, 0x0000205F}, {0x00003000, 0x00003000}, -}; - -static const std::vector> unicode_ranges_accent_mark = { -{0x00000300, 0x0000036F}, {0x00000483, 0x00000489}, {0x00000591, 0x000005BD}, {0x000005BF, 0x000005BF}, -{0x000005C1, 0x000005C2}, {0x000005C4, 0x000005C5}, {0x000005C7, 0x000005C7}, {0x00000610, 0x0000061A}, -{0x0000064B, 0x0000065F}, {0x00000670, 0x00000670}, {0x000006D6, 0x000006DC}, {0x000006DF, 0x000006E4}, -{0x000006E7, 0x000006E8}, {0x000006EA, 0x000006ED}, {0x00000711, 0x00000711}, {0x00000730, 0x0000074A}, -{0x000007A6, 0x000007B0}, {0x000007EB, 0x000007F3}, {0x000007FD, 0x000007FD}, {0x00000816, 0x00000819}, -{0x0000081B, 0x00000823}, {0x00000825, 0x00000827}, {0x00000829, 0x0000082D}, {0x00000859, 0x0000085B}, -{0x000008D3, 0x000008E1}, {0x000008E3, 0x00000903}, {0x0000093A, 0x0000093C}, {0x0000093E, 0x0000094F}, -{0x00000951, 0x00000957}, {0x00000962, 0x00000963}, {0x00000981, 0x00000983}, {0x000009BC, 0x000009BC}, -{0x000009BE, 0x000009C4}, {0x000009C7, 0x000009C8}, {0x000009CB, 0x000009CD}, {0x000009D7, 0x000009D7}, -{0x000009E2, 0x000009E3}, {0x000009FE, 0x000009FE}, {0x00000A01, 0x00000A03}, {0x00000A3C, 0x00000A3C}, -{0x00000A3E, 0x00000A42}, {0x00000A47, 0x00000A48}, {0x00000A4B, 0x00000A4D}, {0x00000A51, 0x00000A51}, -{0x00000A70, 0x00000A71}, {0x00000A75, 0x00000A75}, {0x00000A81, 0x00000A83}, {0x00000ABC, 0x00000ABC}, -{0x00000ABE, 0x00000AC5}, {0x00000AC7, 0x00000AC9}, {0x00000ACB, 0x00000ACD}, {0x00000AE2, 0x00000AE3}, -{0x00000AFA, 0x00000AFF}, {0x00000B01, 0x00000B03}, {0x00000B3C, 0x00000B3C}, {0x00000B3E, 0x00000B44}, -{0x00000B47, 0x00000B48}, {0x00000B4B, 0x00000B4D}, {0x00000B55, 0x00000B57}, {0x00000B62, 0x00000B63}, -{0x00000B82, 0x00000B82}, {0x00000BBE, 0x00000BC2}, {0x00000BC6, 0x00000BC8}, {0x00000BCA, 0x00000BCD}, -{0x00000BD7, 0x00000BD7}, {0x00000C00, 0x00000C04}, {0x00000C3E, 0x00000C44}, {0x00000C46, 0x00000C48}, -{0x00000C4A, 0x00000C4D}, {0x00000C55, 0x00000C56}, {0x00000C62, 0x00000C63}, {0x00000C81, 0x00000C83}, -{0x00000CBC, 0x00000CBC}, {0x00000CBE, 0x00000CC4}, {0x00000CC6, 0x00000CC8}, {0x00000CCA, 0x00000CCD}, -{0x00000CD5, 0x00000CD6}, {0x00000CE2, 0x00000CE3}, {0x00000D00, 0x00000D03}, {0x00000D3B, 0x00000D3C}, -{0x00000D3E, 0x00000D44}, {0x00000D46, 0x00000D48}, {0x00000D4A, 0x00000D4D}, {0x00000D57, 0x00000D57}, -{0x00000D62, 0x00000D63}, {0x00000D81, 0x00000D83}, {0x00000DCA, 0x00000DCA}, {0x00000DCF, 0x00000DD4}, -{0x00000DD6, 0x00000DD6}, {0x00000DD8, 0x00000DDF}, {0x00000DF2, 0x00000DF3}, {0x00000E31, 0x00000E31}, -{0x00000E34, 0x00000E3A}, {0x00000E47, 0x00000E4E}, {0x00000EB1, 0x00000EB1}, {0x00000EB4, 0x00000EBC}, -{0x00000EC8, 0x00000ECD}, {0x00000F18, 0x00000F19}, {0x00000F35, 0x00000F35}, {0x00000F37, 0x00000F37}, -{0x00000F39, 0x00000F39}, {0x00000F3E, 0x00000F3F}, {0x00000F71, 0x00000F84}, {0x00000F86, 0x00000F87}, -{0x00000F8D, 0x00000F97}, {0x00000F99, 0x00000FBC}, {0x00000FC6, 0x00000FC6}, {0x0000102B, 0x0000103E}, -{0x00001056, 0x00001059}, {0x0000105E, 0x00001060}, {0x00001062, 0x00001064}, {0x00001067, 0x0000106D}, -{0x00001071, 0x00001074}, {0x00001082, 0x0000108D}, {0x0000108F, 0x0000108F}, {0x0000109A, 0x0000109D}, -{0x0000135D, 0x0000135F}, {0x00001712, 0x00001714}, {0x00001732, 0x00001734}, {0x00001752, 0x00001753}, -{0x00001772, 0x00001773}, {0x000017B4, 0x000017D3}, {0x000017DD, 0x000017DD}, {0x0000180B, 0x0000180D}, -{0x00001885, 0x00001886}, {0x000018A9, 0x000018A9}, {0x00001920, 0x0000192B}, {0x00001930, 0x0000193B}, -{0x00001A17, 0x00001A1B}, {0x00001A55, 0x00001A5E}, {0x00001A60, 0x00001A7C}, {0x00001A7F, 0x00001A7F}, -{0x00001AB0, 0x00001AC0}, {0x00001B00, 0x00001B04}, {0x00001B34, 0x00001B44}, {0x00001B6B, 0x00001B73}, -{0x00001B80, 0x00001B82}, {0x00001BA1, 0x00001BAD}, {0x00001BE6, 0x00001BF3}, {0x00001C24, 0x00001C37}, -{0x00001CD0, 0x00001CD2}, {0x00001CD4, 0x00001CE8}, {0x00001CED, 0x00001CED}, {0x00001CF4, 0x00001CF4}, -{0x00001CF7, 0x00001CF9}, {0x00001DC0, 0x00001DF9}, {0x00001DFB, 0x00001DFF}, {0x000020D0, 0x000020F0}, -{0x00002CEF, 0x00002CF1}, {0x00002D7F, 0x00002D7F}, {0x00002DE0, 0x00002DFF}, {0x0000302A, 0x0000302F}, -{0x00003099, 0x0000309A}, {0x0000A66F, 0x0000A672}, {0x0000A674, 0x0000A67D}, {0x0000A69E, 0x0000A69F}, -{0x0000A6F0, 0x0000A6F1}, {0x0000A802, 0x0000A802}, {0x0000A806, 0x0000A806}, {0x0000A80B, 0x0000A80B}, -{0x0000A823, 0x0000A827}, {0x0000A82C, 0x0000A82C}, {0x0000A880, 0x0000A881}, {0x0000A8B4, 0x0000A8C5}, -{0x0000A8E0, 0x0000A8F1}, {0x0000A8FF, 0x0000A8FF}, {0x0000A926, 0x0000A92D}, {0x0000A947, 0x0000A953}, -{0x0000A980, 0x0000A983}, {0x0000A9B3, 0x0000A9C0}, {0x0000A9E5, 0x0000A9E5}, {0x0000AA29, 0x0000AA36}, -{0x0000AA43, 0x0000AA43}, {0x0000AA4C, 0x0000AA4D}, {0x0000AA7B, 0x0000AA7D}, {0x0000AAB0, 0x0000AAB0}, -{0x0000AAB2, 0x0000AAB4}, {0x0000AAB7, 0x0000AAB8}, {0x0000AABE, 0x0000AABF}, {0x0000AAC1, 0x0000AAC1}, -{0x0000AAEB, 0x0000AAEF}, {0x0000AAF5, 0x0000AAF6}, {0x0000ABE3, 0x0000ABEA}, {0x0000ABEC, 0x0000ABED}, -{0x0000FB1E, 0x0000FB1E}, {0x0000FE00, 0x0000FE0F}, {0x0000FE20, 0x0000FE2F}, {0x000101FD, 0x000101FD}, -{0x000102E0, 0x000102E0}, {0x00010376, 0x0001037A}, {0x00010A01, 0x00010A03}, {0x00010A05, 0x00010A06}, -{0x00010A0C, 0x00010A0F}, {0x00010A38, 0x00010A3A}, {0x00010A3F, 0x00010A3F}, {0x00010AE5, 0x00010AE6}, -{0x00010D24, 0x00010D27}, {0x00010EAB, 0x00010EAC}, {0x00010F46, 0x00010F50}, {0x00011000, 0x00011002}, -{0x00011038, 0x00011046}, {0x0001107F, 0x00011082}, {0x000110B0, 0x000110BA}, {0x00011100, 0x00011102}, -{0x00011127, 0x00011134}, {0x00011145, 0x00011146}, {0x00011173, 0x00011173}, {0x00011180, 0x00011182}, -{0x000111B3, 0x000111C0}, {0x000111C9, 0x000111CC}, {0x000111CE, 0x000111CF}, {0x0001122C, 0x00011237}, -{0x0001123E, 0x0001123E}, {0x000112DF, 0x000112EA}, {0x00011300, 0x00011303}, {0x0001133B, 0x0001133C}, -{0x0001133E, 0x00011344}, {0x00011347, 0x00011348}, {0x0001134B, 0x0001134D}, {0x00011357, 0x00011357}, -{0x00011362, 0x00011363}, {0x00011366, 0x0001136C}, {0x00011370, 0x00011374}, {0x00011435, 0x00011446}, -{0x0001145E, 0x0001145E}, {0x000114B0, 0x000114C3}, {0x000115AF, 0x000115B5}, {0x000115B8, 0x000115C0}, -{0x000115DC, 0x000115DD}, {0x00011630, 0x00011640}, {0x000116AB, 0x000116B7}, {0x0001171D, 0x0001172B}, -{0x0001182C, 0x0001183A}, {0x00011930, 0x00011935}, {0x00011937, 0x00011938}, {0x0001193B, 0x0001193E}, -{0x00011940, 0x00011940}, {0x00011942, 0x00011943}, {0x000119D1, 0x000119D7}, {0x000119DA, 0x000119E0}, -{0x000119E4, 0x000119E4}, {0x00011A01, 0x00011A0A}, {0x00011A33, 0x00011A39}, {0x00011A3B, 0x00011A3E}, -{0x00011A47, 0x00011A47}, {0x00011A51, 0x00011A5B}, {0x00011A8A, 0x00011A99}, {0x00011C2F, 0x00011C36}, -{0x00011C38, 0x00011C3F}, {0x00011C92, 0x00011CA7}, {0x00011CA9, 0x00011CB6}, {0x00011D31, 0x00011D36}, -{0x00011D3A, 0x00011D3A}, {0x00011D3C, 0x00011D3D}, {0x00011D3F, 0x00011D45}, {0x00011D47, 0x00011D47}, -{0x00011D8A, 0x00011D8E}, {0x00011D90, 0x00011D91}, {0x00011D93, 0x00011D97}, {0x00011EF3, 0x00011EF6}, -{0x00016AF0, 0x00016AF4}, {0x00016B30, 0x00016B36}, {0x00016F4F, 0x00016F4F}, {0x00016F51, 0x00016F87}, -{0x00016F8F, 0x00016F92}, {0x00016FE4, 0x00016FE4}, {0x00016FF0, 0x00016FF1}, {0x0001BC9D, 0x0001BC9E}, -{0x0001D165, 0x0001D169}, {0x0001D16D, 0x0001D172}, {0x0001D17B, 0x0001D182}, {0x0001D185, 0x0001D18B}, -{0x0001D1AA, 0x0001D1AD}, {0x0001D242, 0x0001D244}, {0x0001DA00, 0x0001DA36}, {0x0001DA3B, 0x0001DA6C}, -{0x0001DA75, 0x0001DA75}, {0x0001DA84, 0x0001DA84}, {0x0001DA9B, 0x0001DA9F}, {0x0001DAA1, 0x0001DAAF}, -{0x0001E000, 0x0001E006}, {0x0001E008, 0x0001E018}, {0x0001E01B, 0x0001E021}, {0x0001E023, 0x0001E024}, -{0x0001E026, 0x0001E02A}, {0x0001E130, 0x0001E136}, {0x0001E2EC, 0x0001E2EF}, {0x0001E8D0, 0x0001E8D6}, -{0x0001E944, 0x0001E94A}, {0x000E0100, 0x000E01EF}, -}; - -static const std::vector> unicode_ranges_punctuation = { -{0x00000021, 0x00000023}, {0x00000025, 0x0000002A}, {0x0000002C, 0x0000002F}, {0x0000003A, 0x0000003B}, -{0x0000003F, 0x00000040}, {0x0000005B, 0x0000005D}, {0x0000005F, 0x0000005F}, {0x0000007B, 0x0000007B}, -{0x0000007D, 0x0000007D}, {0x000000A1, 0x000000A1}, {0x000000A7, 0x000000A7}, {0x000000AB, 0x000000AB}, -{0x000000B6, 0x000000B7}, {0x000000BB, 0x000000BB}, {0x000000BF, 0x000000BF}, {0x0000037E, 0x0000037E}, -{0x00000387, 0x00000387}, {0x0000055A, 0x0000055F}, {0x00000589, 0x0000058A}, {0x000005BE, 0x000005BE}, -{0x000005C0, 0x000005C0}, {0x000005C3, 0x000005C3}, {0x000005C6, 0x000005C6}, {0x000005F3, 0x000005F4}, -{0x00000609, 0x0000060A}, {0x0000060C, 0x0000060D}, {0x0000061B, 0x0000061B}, {0x0000061E, 0x0000061F}, -{0x0000066A, 0x0000066D}, {0x000006D4, 0x000006D4}, {0x00000700, 0x0000070D}, {0x000007F7, 0x000007F9}, -{0x00000830, 0x0000083E}, {0x0000085E, 0x0000085E}, {0x00000964, 0x00000965}, {0x00000970, 0x00000970}, -{0x000009FD, 0x000009FD}, {0x00000A76, 0x00000A76}, {0x00000AF0, 0x00000AF0}, {0x00000C77, 0x00000C77}, -{0x00000C84, 0x00000C84}, {0x00000DF4, 0x00000DF4}, {0x00000E4F, 0x00000E4F}, {0x00000E5A, 0x00000E5B}, -{0x00000F04, 0x00000F12}, {0x00000F14, 0x00000F14}, {0x00000F3A, 0x00000F3D}, {0x00000F85, 0x00000F85}, -{0x00000FD0, 0x00000FD4}, {0x00000FD9, 0x00000FDA}, {0x0000104A, 0x0000104F}, {0x000010FB, 0x000010FB}, -{0x00001360, 0x00001368}, {0x00001400, 0x00001400}, {0x0000166E, 0x0000166E}, {0x0000169B, 0x0000169C}, -{0x000016EB, 0x000016ED}, {0x00001735, 0x00001736}, {0x000017D4, 0x000017D6}, {0x000017D8, 0x000017DA}, -{0x00001800, 0x0000180A}, {0x00001944, 0x00001945}, {0x00001A1E, 0x00001A1F}, {0x00001AA0, 0x00001AA6}, -{0x00001AA8, 0x00001AAD}, {0x00001B5A, 0x00001B60}, {0x00001BFC, 0x00001BFF}, {0x00001C3B, 0x00001C3F}, -{0x00001C7E, 0x00001C7F}, {0x00001CC0, 0x00001CC7}, {0x00001CD3, 0x00001CD3}, {0x00002010, 0x00002027}, -{0x00002030, 0x00002043}, {0x00002045, 0x00002051}, {0x00002053, 0x0000205E}, {0x0000207D, 0x0000207E}, -{0x0000208D, 0x0000208E}, {0x00002308, 0x0000230B}, {0x00002329, 0x0000232A}, {0x00002768, 0x00002775}, -{0x000027C5, 0x000027C6}, {0x000027E6, 0x000027EF}, {0x00002983, 0x00002998}, {0x000029D8, 0x000029DB}, -{0x000029FC, 0x000029FD}, {0x00002CF9, 0x00002CFC}, {0x00002CFE, 0x00002CFF}, {0x00002D70, 0x00002D70}, -{0x00002E00, 0x00002E2E}, {0x00002E30, 0x00002E4F}, {0x00002E52, 0x00002E52}, {0x00003001, 0x00003003}, -{0x00003008, 0x00003011}, {0x00003014, 0x0000301F}, {0x00003030, 0x00003030}, {0x0000303D, 0x0000303D}, -{0x000030A0, 0x000030A0}, {0x000030FB, 0x000030FB}, {0x0000A4FE, 0x0000A4FF}, {0x0000A60D, 0x0000A60F}, -{0x0000A673, 0x0000A673}, {0x0000A67E, 0x0000A67E}, {0x0000A6F2, 0x0000A6F7}, {0x0000A874, 0x0000A877}, -{0x0000A8CE, 0x0000A8CF}, {0x0000A8F8, 0x0000A8FA}, {0x0000A8FC, 0x0000A8FC}, {0x0000A92E, 0x0000A92F}, -{0x0000A95F, 0x0000A95F}, {0x0000A9C1, 0x0000A9CD}, {0x0000A9DE, 0x0000A9DF}, {0x0000AA5C, 0x0000AA5F}, -{0x0000AADE, 0x0000AADF}, {0x0000AAF0, 0x0000AAF1}, {0x0000ABEB, 0x0000ABEB}, {0x0000FD3E, 0x0000FD3F}, -{0x0000FE10, 0x0000FE19}, {0x0000FE30, 0x0000FE52}, {0x0000FE54, 0x0000FE61}, {0x0000FE63, 0x0000FE63}, -{0x0000FE68, 0x0000FE68}, {0x0000FE6A, 0x0000FE6B}, {0x0000FF01, 0x0000FF03}, {0x0000FF05, 0x0000FF0A}, -{0x0000FF0C, 0x0000FF0F}, {0x0000FF1A, 0x0000FF1B}, {0x0000FF1F, 0x0000FF20}, {0x0000FF3B, 0x0000FF3D}, -{0x0000FF3F, 0x0000FF3F}, {0x0000FF5B, 0x0000FF5B}, {0x0000FF5D, 0x0000FF5D}, {0x0000FF5F, 0x0000FF65}, -{0x00010100, 0x00010102}, {0x0001039F, 0x0001039F}, {0x000103D0, 0x000103D0}, {0x0001056F, 0x0001056F}, -{0x00010857, 0x00010857}, {0x0001091F, 0x0001091F}, {0x0001093F, 0x0001093F}, {0x00010A50, 0x00010A58}, -{0x00010A7F, 0x00010A7F}, {0x00010AF0, 0x00010AF6}, {0x00010B39, 0x00010B3F}, {0x00010B99, 0x00010B9C}, -{0x00010EAD, 0x00010EAD}, {0x00010F55, 0x00010F59}, {0x00011047, 0x0001104D}, {0x000110BB, 0x000110BC}, -{0x000110BE, 0x000110C1}, {0x00011140, 0x00011143}, {0x00011174, 0x00011175}, {0x000111C5, 0x000111C8}, -{0x000111CD, 0x000111CD}, {0x000111DB, 0x000111DB}, {0x000111DD, 0x000111DF}, {0x00011238, 0x0001123D}, -{0x000112A9, 0x000112A9}, {0x0001144B, 0x0001144F}, {0x0001145A, 0x0001145B}, {0x0001145D, 0x0001145D}, -{0x000114C6, 0x000114C6}, {0x000115C1, 0x000115D7}, {0x00011641, 0x00011643}, {0x00011660, 0x0001166C}, -{0x0001173C, 0x0001173E}, {0x0001183B, 0x0001183B}, {0x00011944, 0x00011946}, {0x000119E2, 0x000119E2}, -{0x00011A3F, 0x00011A46}, {0x00011A9A, 0x00011A9C}, {0x00011A9E, 0x00011AA2}, {0x00011C41, 0x00011C45}, -{0x00011C70, 0x00011C71}, {0x00011EF7, 0x00011EF8}, {0x00011FFF, 0x00011FFF}, {0x00012470, 0x00012474}, -{0x00016A6E, 0x00016A6F}, {0x00016AF5, 0x00016AF5}, {0x00016B37, 0x00016B3B}, {0x00016B44, 0x00016B44}, -{0x00016E97, 0x00016E9A}, {0x00016FE2, 0x00016FE2}, {0x0001BC9F, 0x0001BC9F}, {0x0001DA87, 0x0001DA8B}, -{0x0001E95E, 0x0001E95F}, -}; - -static const std::vector> unicode_ranges_symbol = { -{0x00000024, 0x00000024}, {0x0000002B, 0x0000002B}, {0x0000003C, 0x0000003E}, {0x0000005E, 0x0000005E}, -{0x00000060, 0x00000060}, {0x0000007C, 0x0000007C}, {0x0000007E, 0x0000007E}, {0x000000A2, 0x000000A6}, -{0x000000A8, 0x000000A9}, {0x000000AC, 0x000000AC}, {0x000000AE, 0x000000B1}, {0x000000B4, 0x000000B4}, -{0x000000B8, 0x000000B8}, {0x000000D7, 0x000000D7}, {0x000000F7, 0x000000F7}, {0x000002C2, 0x000002C5}, -{0x000002D2, 0x000002DF}, {0x000002E5, 0x000002EB}, {0x000002ED, 0x000002ED}, {0x000002EF, 0x000002FF}, -{0x00000375, 0x00000375}, {0x00000384, 0x00000385}, {0x000003F6, 0x000003F6}, {0x00000482, 0x00000482}, -{0x0000058D, 0x0000058F}, {0x00000606, 0x00000608}, {0x0000060B, 0x0000060B}, {0x0000060E, 0x0000060F}, -{0x000006DE, 0x000006DE}, {0x000006E9, 0x000006E9}, {0x000006FD, 0x000006FE}, {0x000007F6, 0x000007F6}, -{0x000007FE, 0x000007FF}, {0x000009F2, 0x000009F3}, {0x000009FA, 0x000009FB}, {0x00000AF1, 0x00000AF1}, -{0x00000B70, 0x00000B70}, {0x00000BF3, 0x00000BFA}, {0x00000C7F, 0x00000C7F}, {0x00000D4F, 0x00000D4F}, -{0x00000D79, 0x00000D79}, {0x00000E3F, 0x00000E3F}, {0x00000F01, 0x00000F03}, {0x00000F13, 0x00000F13}, -{0x00000F15, 0x00000F17}, {0x00000F1A, 0x00000F1F}, {0x00000F34, 0x00000F34}, {0x00000F36, 0x00000F36}, -{0x00000F38, 0x00000F38}, {0x00000FBE, 0x00000FC5}, {0x00000FC7, 0x00000FCC}, {0x00000FCE, 0x00000FCF}, -{0x00000FD5, 0x00000FD8}, {0x0000109E, 0x0000109F}, {0x00001390, 0x00001399}, {0x0000166D, 0x0000166D}, -{0x000017DB, 0x000017DB}, {0x00001940, 0x00001940}, {0x000019DE, 0x000019FF}, {0x00001B61, 0x00001B6A}, -{0x00001B74, 0x00001B7C}, {0x00001FBD, 0x00001FBD}, {0x00001FBF, 0x00001FC1}, {0x00001FCD, 0x00001FCF}, -{0x00001FDD, 0x00001FDF}, {0x00001FED, 0x00001FEF}, {0x00001FFD, 0x00001FFE}, {0x00002044, 0x00002044}, -{0x00002052, 0x00002052}, {0x0000207A, 0x0000207C}, {0x0000208A, 0x0000208C}, {0x000020A0, 0x000020BF}, -{0x00002100, 0x00002101}, {0x00002103, 0x00002106}, {0x00002108, 0x00002109}, {0x00002114, 0x00002114}, -{0x00002116, 0x00002118}, {0x0000211E, 0x00002123}, {0x00002125, 0x00002125}, {0x00002127, 0x00002127}, -{0x00002129, 0x00002129}, {0x0000212E, 0x0000212E}, {0x0000213A, 0x0000213B}, {0x00002140, 0x00002144}, -{0x0000214A, 0x0000214D}, {0x0000214F, 0x0000214F}, {0x0000218A, 0x0000218B}, {0x00002190, 0x00002307}, -{0x0000230C, 0x00002328}, {0x0000232B, 0x00002426}, {0x00002440, 0x0000244A}, {0x0000249C, 0x000024E9}, -{0x00002500, 0x00002767}, {0x00002794, 0x000027C4}, {0x000027C7, 0x000027E5}, {0x000027F0, 0x00002982}, -{0x00002999, 0x000029D7}, {0x000029DC, 0x000029FB}, {0x000029FE, 0x00002B73}, {0x00002B76, 0x00002B95}, -{0x00002B97, 0x00002BFF}, {0x00002CE5, 0x00002CEA}, {0x00002E50, 0x00002E51}, {0x00002E80, 0x00002E99}, -{0x00002E9B, 0x00002EF3}, {0x00002F00, 0x00002FD5}, {0x00002FF0, 0x00002FFB}, {0x00003004, 0x00003004}, -{0x00003012, 0x00003013}, {0x00003020, 0x00003020}, {0x00003036, 0x00003037}, {0x0000303E, 0x0000303F}, -{0x0000309B, 0x0000309C}, {0x00003190, 0x00003191}, {0x00003196, 0x0000319F}, {0x000031C0, 0x000031E3}, -{0x00003200, 0x0000321E}, {0x0000322A, 0x00003247}, {0x00003250, 0x00003250}, {0x00003260, 0x0000327F}, -{0x0000328A, 0x000032B0}, {0x000032C0, 0x000033FF}, {0x00004DC0, 0x00004DFF}, {0x0000A490, 0x0000A4C6}, -{0x0000A700, 0x0000A716}, {0x0000A720, 0x0000A721}, {0x0000A789, 0x0000A78A}, {0x0000A828, 0x0000A82B}, -{0x0000A836, 0x0000A839}, {0x0000AA77, 0x0000AA79}, {0x0000AB5B, 0x0000AB5B}, {0x0000AB6A, 0x0000AB6B}, -{0x0000FB29, 0x0000FB29}, {0x0000FBB2, 0x0000FBC1}, {0x0000FDFC, 0x0000FDFD}, {0x0000FE62, 0x0000FE62}, -{0x0000FE64, 0x0000FE66}, {0x0000FE69, 0x0000FE69}, {0x0000FF04, 0x0000FF04}, {0x0000FF0B, 0x0000FF0B}, -{0x0000FF1C, 0x0000FF1E}, {0x0000FF3E, 0x0000FF3E}, {0x0000FF40, 0x0000FF40}, {0x0000FF5C, 0x0000FF5C}, -{0x0000FF5E, 0x0000FF5E}, {0x0000FFE0, 0x0000FFE6}, {0x0000FFE8, 0x0000FFEE}, {0x0000FFFC, 0x0000FFFD}, -{0x00010137, 0x0001013F}, {0x00010179, 0x00010189}, {0x0001018C, 0x0001018E}, {0x00010190, 0x0001019C}, -{0x000101A0, 0x000101A0}, {0x000101D0, 0x000101FC}, {0x00010877, 0x00010878}, {0x00010AC8, 0x00010AC8}, -{0x0001173F, 0x0001173F}, {0x00011FD5, 0x00011FF1}, {0x00016B3C, 0x00016B3F}, {0x00016B45, 0x00016B45}, -{0x0001BC9C, 0x0001BC9C}, {0x0001D000, 0x0001D0F5}, {0x0001D100, 0x0001D126}, {0x0001D129, 0x0001D164}, -{0x0001D16A, 0x0001D16C}, {0x0001D183, 0x0001D184}, {0x0001D18C, 0x0001D1A9}, {0x0001D1AE, 0x0001D1E8}, -{0x0001D200, 0x0001D241}, {0x0001D245, 0x0001D245}, {0x0001D300, 0x0001D356}, {0x0001D6C1, 0x0001D6C1}, -{0x0001D6DB, 0x0001D6DB}, {0x0001D6FB, 0x0001D6FB}, {0x0001D715, 0x0001D715}, {0x0001D735, 0x0001D735}, -{0x0001D74F, 0x0001D74F}, {0x0001D76F, 0x0001D76F}, {0x0001D789, 0x0001D789}, {0x0001D7A9, 0x0001D7A9}, -{0x0001D7C3, 0x0001D7C3}, {0x0001D800, 0x0001D9FF}, {0x0001DA37, 0x0001DA3A}, {0x0001DA6D, 0x0001DA74}, -{0x0001DA76, 0x0001DA83}, {0x0001DA85, 0x0001DA86}, {0x0001E14F, 0x0001E14F}, {0x0001E2FF, 0x0001E2FF}, -{0x0001ECAC, 0x0001ECAC}, {0x0001ECB0, 0x0001ECB0}, {0x0001ED2E, 0x0001ED2E}, {0x0001EEF0, 0x0001EEF1}, -{0x0001F000, 0x0001F02B}, {0x0001F030, 0x0001F093}, {0x0001F0A0, 0x0001F0AE}, {0x0001F0B1, 0x0001F0BF}, -{0x0001F0C1, 0x0001F0CF}, {0x0001F0D1, 0x0001F0F5}, {0x0001F10D, 0x0001F1AD}, {0x0001F1E6, 0x0001F202}, -{0x0001F210, 0x0001F23B}, {0x0001F240, 0x0001F248}, {0x0001F250, 0x0001F251}, {0x0001F260, 0x0001F265}, -{0x0001F300, 0x0001F6D7}, {0x0001F6E0, 0x0001F6EC}, {0x0001F6F0, 0x0001F6FC}, {0x0001F700, 0x0001F773}, -{0x0001F780, 0x0001F7D8}, {0x0001F7E0, 0x0001F7EB}, {0x0001F800, 0x0001F80B}, {0x0001F810, 0x0001F847}, -{0x0001F850, 0x0001F859}, {0x0001F860, 0x0001F887}, {0x0001F890, 0x0001F8AD}, {0x0001F8B0, 0x0001F8B1}, -{0x0001F900, 0x0001F978}, {0x0001F97A, 0x0001F9CB}, {0x0001F9CD, 0x0001FA53}, {0x0001FA60, 0x0001FA6D}, -{0x0001FA70, 0x0001FA74}, {0x0001FA78, 0x0001FA7A}, {0x0001FA80, 0x0001FA86}, {0x0001FA90, 0x0001FAA8}, -{0x0001FAB0, 0x0001FAB6}, {0x0001FAC0, 0x0001FAC2}, {0x0001FAD0, 0x0001FAD6}, {0x0001FB00, 0x0001FB92}, -{0x0001FB94, 0x0001FBCA}, -}; - -static const std::vector> unicode_ranges_control = { -{0x00000000, 0x00000008}, {0x0000000E, 0x0000001B}, {0x0000007F, 0x00000084}, {0x00000086, 0x0000009F}, -{0x000000AD, 0x000000AD}, {0x00000378, 0x00000379}, {0x00000380, 0x00000383}, {0x0000038B, 0x0000038B}, -{0x0000038D, 0x0000038D}, {0x000003A2, 0x000003A2}, {0x00000530, 0x00000530}, {0x00000557, 0x00000558}, -{0x0000058B, 0x0000058C}, {0x00000590, 0x00000590}, {0x000005C8, 0x000005CF}, {0x000005EB, 0x000005EE}, -{0x000005F5, 0x00000605}, {0x0000061C, 0x0000061D}, {0x000006DD, 0x000006DD}, {0x0000070E, 0x0000070F}, -{0x0000074B, 0x0000074C}, {0x000007B2, 0x000007BF}, {0x000007FB, 0x000007FC}, {0x0000082E, 0x0000082F}, -{0x0000083F, 0x0000083F}, {0x0000085C, 0x0000085D}, {0x0000085F, 0x0000085F}, {0x0000086B, 0x0000089F}, -{0x000008B5, 0x000008B5}, {0x000008C8, 0x000008D2}, {0x000008E2, 0x000008E2}, {0x00000984, 0x00000984}, -{0x0000098D, 0x0000098E}, {0x00000991, 0x00000992}, {0x000009A9, 0x000009A9}, {0x000009B1, 0x000009B1}, -{0x000009B3, 0x000009B5}, {0x000009BA, 0x000009BB}, {0x000009C5, 0x000009C6}, {0x000009C9, 0x000009CA}, -{0x000009CF, 0x000009D6}, {0x000009D8, 0x000009DB}, {0x000009DE, 0x000009DE}, {0x000009E4, 0x000009E5}, -{0x000009FF, 0x00000A00}, {0x00000A04, 0x00000A04}, {0x00000A0B, 0x00000A0E}, {0x00000A11, 0x00000A12}, -{0x00000A29, 0x00000A29}, {0x00000A31, 0x00000A31}, {0x00000A34, 0x00000A34}, {0x00000A37, 0x00000A37}, -{0x00000A3A, 0x00000A3B}, {0x00000A3D, 0x00000A3D}, {0x00000A43, 0x00000A46}, {0x00000A49, 0x00000A4A}, -{0x00000A4E, 0x00000A50}, {0x00000A52, 0x00000A58}, {0x00000A5D, 0x00000A5D}, {0x00000A5F, 0x00000A65}, -{0x00000A77, 0x00000A80}, {0x00000A84, 0x00000A84}, {0x00000A8E, 0x00000A8E}, {0x00000A92, 0x00000A92}, -{0x00000AA9, 0x00000AA9}, {0x00000AB1, 0x00000AB1}, {0x00000AB4, 0x00000AB4}, {0x00000ABA, 0x00000ABB}, -{0x00000AC6, 0x00000AC6}, {0x00000ACA, 0x00000ACA}, {0x00000ACE, 0x00000ACF}, {0x00000AD1, 0x00000ADF}, -{0x00000AE4, 0x00000AE5}, {0x00000AF2, 0x00000AF8}, {0x00000B00, 0x00000B00}, {0x00000B04, 0x00000B04}, -{0x00000B0D, 0x00000B0E}, {0x00000B11, 0x00000B12}, {0x00000B29, 0x00000B29}, {0x00000B31, 0x00000B31}, -{0x00000B34, 0x00000B34}, {0x00000B3A, 0x00000B3B}, {0x00000B45, 0x00000B46}, {0x00000B49, 0x00000B4A}, -{0x00000B4E, 0x00000B54}, {0x00000B58, 0x00000B5B}, {0x00000B5E, 0x00000B5E}, {0x00000B64, 0x00000B65}, -{0x00000B78, 0x00000B81}, {0x00000B84, 0x00000B84}, {0x00000B8B, 0x00000B8D}, {0x00000B91, 0x00000B91}, -{0x00000B96, 0x00000B98}, {0x00000B9B, 0x00000B9B}, {0x00000B9D, 0x00000B9D}, {0x00000BA0, 0x00000BA2}, -{0x00000BA5, 0x00000BA7}, {0x00000BAB, 0x00000BAD}, {0x00000BBA, 0x00000BBD}, {0x00000BC3, 0x00000BC5}, -{0x00000BC9, 0x00000BC9}, {0x00000BCE, 0x00000BCF}, {0x00000BD1, 0x00000BD6}, {0x00000BD8, 0x00000BE5}, -{0x00000BFB, 0x00000BFF}, {0x00000C0D, 0x00000C0D}, {0x00000C11, 0x00000C11}, {0x00000C29, 0x00000C29}, -{0x00000C3A, 0x00000C3C}, {0x00000C45, 0x00000C45}, {0x00000C49, 0x00000C49}, {0x00000C4E, 0x00000C54}, -{0x00000C57, 0x00000C57}, {0x00000C5B, 0x00000C5F}, {0x00000C64, 0x00000C65}, {0x00000C70, 0x00000C76}, -{0x00000C8D, 0x00000C8D}, {0x00000C91, 0x00000C91}, {0x00000CA9, 0x00000CA9}, {0x00000CB4, 0x00000CB4}, -{0x00000CBA, 0x00000CBB}, {0x00000CC5, 0x00000CC5}, {0x00000CC9, 0x00000CC9}, {0x00000CCE, 0x00000CD4}, -{0x00000CD7, 0x00000CDD}, {0x00000CDF, 0x00000CDF}, {0x00000CE4, 0x00000CE5}, {0x00000CF0, 0x00000CF0}, -{0x00000CF3, 0x00000CFF}, {0x00000D0D, 0x00000D0D}, {0x00000D11, 0x00000D11}, {0x00000D45, 0x00000D45}, -{0x00000D49, 0x00000D49}, {0x00000D50, 0x00000D53}, {0x00000D64, 0x00000D65}, {0x00000D80, 0x00000D80}, -{0x00000D84, 0x00000D84}, {0x00000D97, 0x00000D99}, {0x00000DB2, 0x00000DB2}, {0x00000DBC, 0x00000DBC}, -{0x00000DBE, 0x00000DBF}, {0x00000DC7, 0x00000DC9}, {0x00000DCB, 0x00000DCE}, {0x00000DD5, 0x00000DD5}, -{0x00000DD7, 0x00000DD7}, {0x00000DE0, 0x00000DE5}, {0x00000DF0, 0x00000DF1}, {0x00000DF5, 0x00000E00}, -{0x00000E3B, 0x00000E3E}, {0x00000E5C, 0x00000E80}, {0x00000E83, 0x00000E83}, {0x00000E85, 0x00000E85}, -{0x00000E8B, 0x00000E8B}, {0x00000EA4, 0x00000EA4}, {0x00000EA6, 0x00000EA6}, {0x00000EBE, 0x00000EBF}, -{0x00000EC5, 0x00000EC5}, {0x00000EC7, 0x00000EC7}, {0x00000ECE, 0x00000ECF}, {0x00000EDA, 0x00000EDB}, -{0x00000EE0, 0x00000EFF}, {0x00000F48, 0x00000F48}, {0x00000F6D, 0x00000F70}, {0x00000F98, 0x00000F98}, -{0x00000FBD, 0x00000FBD}, {0x00000FCD, 0x00000FCD}, {0x00000FDB, 0x00000FFF}, {0x000010C6, 0x000010C6}, -{0x000010C8, 0x000010CC}, {0x000010CE, 0x000010CF}, {0x00001249, 0x00001249}, {0x0000124E, 0x0000124F}, -{0x00001257, 0x00001257}, {0x00001259, 0x00001259}, {0x0000125E, 0x0000125F}, {0x00001289, 0x00001289}, -{0x0000128E, 0x0000128F}, {0x000012B1, 0x000012B1}, {0x000012B6, 0x000012B7}, {0x000012BF, 0x000012BF}, -{0x000012C1, 0x000012C1}, {0x000012C6, 0x000012C7}, {0x000012D7, 0x000012D7}, {0x00001311, 0x00001311}, -{0x00001316, 0x00001317}, {0x0000135B, 0x0000135C}, {0x0000137D, 0x0000137F}, {0x0000139A, 0x0000139F}, -{0x000013F6, 0x000013F7}, {0x000013FE, 0x000013FF}, {0x0000169D, 0x0000169F}, {0x000016F9, 0x000016FF}, -{0x0000170D, 0x0000170D}, {0x00001715, 0x0000171F}, {0x00001737, 0x0000173F}, {0x00001754, 0x0000175F}, -{0x0000176D, 0x0000176D}, {0x00001771, 0x00001771}, {0x00001774, 0x0000177F}, {0x000017DE, 0x000017DF}, -{0x000017EA, 0x000017EF}, {0x000017FA, 0x000017FF}, {0x0000180E, 0x0000180F}, {0x0000181A, 0x0000181F}, -{0x00001879, 0x0000187F}, {0x000018AB, 0x000018AF}, {0x000018F6, 0x000018FF}, {0x0000191F, 0x0000191F}, -{0x0000192C, 0x0000192F}, {0x0000193C, 0x0000193F}, {0x00001941, 0x00001943}, {0x0000196E, 0x0000196F}, -{0x00001975, 0x0000197F}, {0x000019AC, 0x000019AF}, {0x000019CA, 0x000019CF}, {0x000019DB, 0x000019DD}, -{0x00001A1C, 0x00001A1D}, {0x00001A5F, 0x00001A5F}, {0x00001A7D, 0x00001A7E}, {0x00001A8A, 0x00001A8F}, -{0x00001A9A, 0x00001A9F}, {0x00001AAE, 0x00001AAF}, {0x00001AC1, 0x00001AFF}, {0x00001B4C, 0x00001B4F}, -{0x00001B7D, 0x00001B7F}, {0x00001BF4, 0x00001BFB}, {0x00001C38, 0x00001C3A}, {0x00001C4A, 0x00001C4C}, -{0x00001C89, 0x00001C8F}, {0x00001CBB, 0x00001CBC}, {0x00001CC8, 0x00001CCF}, {0x00001CFB, 0x00001CFF}, -{0x00001DFA, 0x00001DFA}, {0x00001F16, 0x00001F17}, {0x00001F1E, 0x00001F1F}, {0x00001F46, 0x00001F47}, -{0x00001F4E, 0x00001F4F}, {0x00001F58, 0x00001F58}, {0x00001F5A, 0x00001F5A}, {0x00001F5C, 0x00001F5C}, -{0x00001F5E, 0x00001F5E}, {0x00001F7E, 0x00001F7F}, {0x00001FB5, 0x00001FB5}, {0x00001FC5, 0x00001FC5}, -{0x00001FD4, 0x00001FD5}, {0x00001FDC, 0x00001FDC}, {0x00001FF0, 0x00001FF1}, {0x00001FF5, 0x00001FF5}, -{0x00001FFF, 0x00001FFF}, {0x0000200B, 0x0000200F}, {0x0000202A, 0x0000202E}, {0x00002060, 0x0000206F}, -{0x00002072, 0x00002073}, {0x0000208F, 0x0000208F}, {0x0000209D, 0x0000209F}, {0x000020C0, 0x000020CF}, -{0x000020F1, 0x000020FF}, {0x0000218C, 0x0000218F}, {0x00002427, 0x0000243F}, {0x0000244B, 0x0000245F}, -{0x00002B74, 0x00002B75}, {0x00002B96, 0x00002B96}, {0x00002C2F, 0x00002C2F}, {0x00002C5F, 0x00002C5F}, -{0x00002CF4, 0x00002CF8}, {0x00002D26, 0x00002D26}, {0x00002D28, 0x00002D2C}, {0x00002D2E, 0x00002D2F}, -{0x00002D68, 0x00002D6E}, {0x00002D71, 0x00002D7E}, {0x00002D97, 0x00002D9F}, {0x00002DA7, 0x00002DA7}, -{0x00002DAF, 0x00002DAF}, {0x00002DB7, 0x00002DB7}, {0x00002DBF, 0x00002DBF}, {0x00002DC7, 0x00002DC7}, -{0x00002DCF, 0x00002DCF}, {0x00002DD7, 0x00002DD7}, {0x00002DDF, 0x00002DDF}, {0x00002E53, 0x00002E7F}, -{0x00002E9A, 0x00002E9A}, {0x00002EF4, 0x00002EFF}, {0x00002FD6, 0x00002FEF}, {0x00002FFC, 0x00002FFF}, -{0x00003040, 0x00003040}, {0x00003097, 0x00003098}, {0x00003100, 0x00003104}, {0x00003130, 0x00003130}, -{0x0000318F, 0x0000318F}, {0x000031E4, 0x000031EF}, {0x0000321F, 0x0000321F}, {0x00009FFD, 0x00009FFF}, -{0x0000A48D, 0x0000A48F}, {0x0000A4C7, 0x0000A4CF}, {0x0000A62C, 0x0000A63F}, {0x0000A6F8, 0x0000A6FF}, -{0x0000A7C0, 0x0000A7C1}, {0x0000A7CB, 0x0000A7F4}, {0x0000A82D, 0x0000A82F}, {0x0000A83A, 0x0000A83F}, -{0x0000A878, 0x0000A87F}, {0x0000A8C6, 0x0000A8CD}, {0x0000A8DA, 0x0000A8DF}, {0x0000A954, 0x0000A95E}, -{0x0000A97D, 0x0000A97F}, {0x0000A9CE, 0x0000A9CE}, {0x0000A9DA, 0x0000A9DD}, {0x0000A9FF, 0x0000A9FF}, -{0x0000AA37, 0x0000AA3F}, {0x0000AA4E, 0x0000AA4F}, {0x0000AA5A, 0x0000AA5B}, {0x0000AAC3, 0x0000AADA}, -{0x0000AAF7, 0x0000AB00}, {0x0000AB07, 0x0000AB08}, {0x0000AB0F, 0x0000AB10}, {0x0000AB17, 0x0000AB1F}, -{0x0000AB27, 0x0000AB27}, {0x0000AB2F, 0x0000AB2F}, {0x0000AB6C, 0x0000AB6F}, {0x0000ABEE, 0x0000ABEF}, -{0x0000ABFA, 0x0000ABFF}, {0x0000D7A4, 0x0000D7AF}, {0x0000D7C7, 0x0000D7CA}, {0x0000D7FC, 0x0000F8FF}, -{0x0000FA6E, 0x0000FA6F}, {0x0000FADA, 0x0000FAFF}, {0x0000FB07, 0x0000FB12}, {0x0000FB18, 0x0000FB1C}, -{0x0000FB37, 0x0000FB37}, {0x0000FB3D, 0x0000FB3D}, {0x0000FB3F, 0x0000FB3F}, {0x0000FB42, 0x0000FB42}, -{0x0000FB45, 0x0000FB45}, {0x0000FBC2, 0x0000FBD2}, {0x0000FD40, 0x0000FD4F}, {0x0000FD90, 0x0000FD91}, -{0x0000FDC8, 0x0000FDEF}, {0x0000FDFE, 0x0000FDFF}, {0x0000FE1A, 0x0000FE1F}, {0x0000FE53, 0x0000FE53}, -{0x0000FE67, 0x0000FE67}, {0x0000FE6C, 0x0000FE6F}, {0x0000FE75, 0x0000FE75}, {0x0000FEFD, 0x0000FF00}, -{0x0000FFBF, 0x0000FFC1}, {0x0000FFC8, 0x0000FFC9}, {0x0000FFD0, 0x0000FFD1}, {0x0000FFD8, 0x0000FFD9}, -{0x0000FFDD, 0x0000FFDF}, {0x0000FFE7, 0x0000FFE7}, {0x0000FFEF, 0x0000FFFB}, {0x0000FFFE, 0x0000FFFF}, -{0x0001000C, 0x0001000C}, {0x00010027, 0x00010027}, {0x0001003B, 0x0001003B}, {0x0001003E, 0x0001003E}, -{0x0001004E, 0x0001004F}, {0x0001005E, 0x0001007F}, {0x000100FB, 0x000100FF}, {0x00010103, 0x00010106}, -{0x00010134, 0x00010136}, {0x0001018F, 0x0001018F}, {0x0001019D, 0x0001019F}, {0x000101A1, 0x000101CF}, -{0x000101FE, 0x0001027F}, {0x0001029D, 0x0001029F}, {0x000102D1, 0x000102DF}, {0x000102FC, 0x000102FF}, -{0x00010324, 0x0001032C}, {0x0001034B, 0x0001034F}, {0x0001037B, 0x0001037F}, {0x0001039E, 0x0001039E}, -{0x000103C4, 0x000103C7}, {0x000103D6, 0x000103FF}, {0x0001049E, 0x0001049F}, {0x000104AA, 0x000104AF}, -{0x000104D4, 0x000104D7}, {0x000104FC, 0x000104FF}, {0x00010528, 0x0001052F}, {0x00010564, 0x0001056E}, -{0x00010570, 0x000105FF}, {0x00010737, 0x0001073F}, {0x00010756, 0x0001075F}, {0x00010768, 0x000107FF}, -{0x00010806, 0x00010807}, {0x00010809, 0x00010809}, {0x00010836, 0x00010836}, {0x00010839, 0x0001083B}, -{0x0001083D, 0x0001083E}, {0x00010856, 0x00010856}, {0x0001089F, 0x000108A6}, {0x000108B0, 0x000108DF}, -{0x000108F3, 0x000108F3}, {0x000108F6, 0x000108FA}, {0x0001091C, 0x0001091E}, {0x0001093A, 0x0001093E}, -{0x00010940, 0x0001097F}, {0x000109B8, 0x000109BB}, {0x000109D0, 0x000109D1}, {0x00010A04, 0x00010A04}, -{0x00010A07, 0x00010A0B}, {0x00010A14, 0x00010A14}, {0x00010A18, 0x00010A18}, {0x00010A36, 0x00010A37}, -{0x00010A3B, 0x00010A3E}, {0x00010A49, 0x00010A4F}, {0x00010A59, 0x00010A5F}, {0x00010AA0, 0x00010ABF}, -{0x00010AE7, 0x00010AEA}, {0x00010AF7, 0x00010AFF}, {0x00010B36, 0x00010B38}, {0x00010B56, 0x00010B57}, -{0x00010B73, 0x00010B77}, {0x00010B92, 0x00010B98}, {0x00010B9D, 0x00010BA8}, {0x00010BB0, 0x00010BFF}, -{0x00010C49, 0x00010C7F}, {0x00010CB3, 0x00010CBF}, {0x00010CF3, 0x00010CF9}, {0x00010D28, 0x00010D2F}, -{0x00010D3A, 0x00010E5F}, {0x00010E7F, 0x00010E7F}, {0x00010EAA, 0x00010EAA}, {0x00010EAE, 0x00010EAF}, -{0x00010EB2, 0x00010EFF}, {0x00010F28, 0x00010F2F}, {0x00010F5A, 0x00010FAF}, {0x00010FCC, 0x00010FDF}, -{0x00010FF7, 0x00010FFF}, {0x0001104E, 0x00011051}, {0x00011070, 0x0001107E}, {0x000110BD, 0x000110BD}, -{0x000110C2, 0x000110CF}, {0x000110E9, 0x000110EF}, {0x000110FA, 0x000110FF}, {0x00011135, 0x00011135}, -{0x00011148, 0x0001114F}, {0x00011177, 0x0001117F}, {0x000111E0, 0x000111E0}, {0x000111F5, 0x000111FF}, -{0x00011212, 0x00011212}, {0x0001123F, 0x0001127F}, {0x00011287, 0x00011287}, {0x00011289, 0x00011289}, -{0x0001128E, 0x0001128E}, {0x0001129E, 0x0001129E}, {0x000112AA, 0x000112AF}, {0x000112EB, 0x000112EF}, -{0x000112FA, 0x000112FF}, {0x00011304, 0x00011304}, {0x0001130D, 0x0001130E}, {0x00011311, 0x00011312}, -{0x00011329, 0x00011329}, {0x00011331, 0x00011331}, {0x00011334, 0x00011334}, {0x0001133A, 0x0001133A}, -{0x00011345, 0x00011346}, {0x00011349, 0x0001134A}, {0x0001134E, 0x0001134F}, {0x00011351, 0x00011356}, -{0x00011358, 0x0001135C}, {0x00011364, 0x00011365}, {0x0001136D, 0x0001136F}, {0x00011375, 0x000113FF}, -{0x0001145C, 0x0001145C}, {0x00011462, 0x0001147F}, {0x000114C8, 0x000114CF}, {0x000114DA, 0x0001157F}, -{0x000115B6, 0x000115B7}, {0x000115DE, 0x000115FF}, {0x00011645, 0x0001164F}, {0x0001165A, 0x0001165F}, -{0x0001166D, 0x0001167F}, {0x000116B9, 0x000116BF}, {0x000116CA, 0x000116FF}, {0x0001171B, 0x0001171C}, -{0x0001172C, 0x0001172F}, {0x00011740, 0x000117FF}, {0x0001183C, 0x0001189F}, {0x000118F3, 0x000118FE}, -{0x00011907, 0x00011908}, {0x0001190A, 0x0001190B}, {0x00011914, 0x00011914}, {0x00011917, 0x00011917}, -{0x00011936, 0x00011936}, {0x00011939, 0x0001193A}, {0x00011947, 0x0001194F}, {0x0001195A, 0x0001199F}, -{0x000119A8, 0x000119A9}, {0x000119D8, 0x000119D9}, {0x000119E5, 0x000119FF}, {0x00011A48, 0x00011A4F}, -{0x00011AA3, 0x00011ABF}, {0x00011AF9, 0x00011BFF}, {0x00011C09, 0x00011C09}, {0x00011C37, 0x00011C37}, -{0x00011C46, 0x00011C4F}, {0x00011C6D, 0x00011C6F}, {0x00011C90, 0x00011C91}, {0x00011CA8, 0x00011CA8}, -{0x00011CB7, 0x00011CFF}, {0x00011D07, 0x00011D07}, {0x00011D0A, 0x00011D0A}, {0x00011D37, 0x00011D39}, -{0x00011D3B, 0x00011D3B}, {0x00011D3E, 0x00011D3E}, {0x00011D48, 0x00011D4F}, {0x00011D5A, 0x00011D5F}, -{0x00011D66, 0x00011D66}, {0x00011D69, 0x00011D69}, {0x00011D8F, 0x00011D8F}, {0x00011D92, 0x00011D92}, -{0x00011D99, 0x00011D9F}, {0x00011DAA, 0x00011EDF}, {0x00011EF9, 0x00011FAF}, {0x00011FB1, 0x00011FBF}, -{0x00011FF2, 0x00011FFE}, {0x0001239A, 0x000123FF}, {0x0001246F, 0x0001246F}, {0x00012475, 0x0001247F}, -{0x00012544, 0x00012FFF}, {0x0001342F, 0x000143FF}, {0x00014647, 0x000167FF}, {0x00016A39, 0x00016A3F}, -{0x00016A5F, 0x00016A5F}, {0x00016A6A, 0x00016A6D}, {0x00016A70, 0x00016ACF}, {0x00016AEE, 0x00016AEF}, -{0x00016AF6, 0x00016AFF}, {0x00016B46, 0x00016B4F}, {0x00016B5A, 0x00016B5A}, {0x00016B62, 0x00016B62}, -{0x00016B78, 0x00016B7C}, {0x00016B90, 0x00016E3F}, {0x00016E9B, 0x00016EFF}, {0x00016F4B, 0x00016F4E}, -{0x00016F88, 0x00016F8E}, {0x00016FA0, 0x00016FDF}, {0x00016FE5, 0x00016FEF}, {0x00016FF2, 0x00016FFF}, -{0x000187F8, 0x000187FF}, {0x00018CD6, 0x00018CFF}, {0x00018D09, 0x0001AFFF}, {0x0001B11F, 0x0001B14F}, -{0x0001B153, 0x0001B163}, {0x0001B168, 0x0001B16F}, {0x0001B2FC, 0x0001BBFF}, {0x0001BC6B, 0x0001BC6F}, -{0x0001BC7D, 0x0001BC7F}, {0x0001BC89, 0x0001BC8F}, {0x0001BC9A, 0x0001BC9B}, {0x0001BCA0, 0x0001CFFF}, -{0x0001D0F6, 0x0001D0FF}, {0x0001D127, 0x0001D128}, {0x0001D173, 0x0001D17A}, {0x0001D1E9, 0x0001D1FF}, -{0x0001D246, 0x0001D2DF}, {0x0001D2F4, 0x0001D2FF}, {0x0001D357, 0x0001D35F}, {0x0001D379, 0x0001D3FF}, -{0x0001D455, 0x0001D455}, {0x0001D49D, 0x0001D49D}, {0x0001D4A0, 0x0001D4A1}, {0x0001D4A3, 0x0001D4A4}, -{0x0001D4A7, 0x0001D4A8}, {0x0001D4AD, 0x0001D4AD}, {0x0001D4BA, 0x0001D4BA}, {0x0001D4BC, 0x0001D4BC}, -{0x0001D4C4, 0x0001D4C4}, {0x0001D506, 0x0001D506}, {0x0001D50B, 0x0001D50C}, {0x0001D515, 0x0001D515}, -{0x0001D51D, 0x0001D51D}, {0x0001D53A, 0x0001D53A}, {0x0001D53F, 0x0001D53F}, {0x0001D545, 0x0001D545}, -{0x0001D547, 0x0001D549}, {0x0001D551, 0x0001D551}, {0x0001D6A6, 0x0001D6A7}, {0x0001D7CC, 0x0001D7CD}, -{0x0001DA8C, 0x0001DA9A}, {0x0001DAA0, 0x0001DAA0}, {0x0001DAB0, 0x0001DFFF}, {0x0001E007, 0x0001E007}, -{0x0001E019, 0x0001E01A}, {0x0001E022, 0x0001E022}, {0x0001E025, 0x0001E025}, {0x0001E02B, 0x0001E0FF}, -{0x0001E12D, 0x0001E12F}, {0x0001E13E, 0x0001E13F}, {0x0001E14A, 0x0001E14D}, {0x0001E150, 0x0001E2BF}, -{0x0001E2FA, 0x0001E2FE}, {0x0001E300, 0x0001E7FF}, {0x0001E8C5, 0x0001E8C6}, {0x0001E8D7, 0x0001E8FF}, -{0x0001E94C, 0x0001E94F}, {0x0001E95A, 0x0001E95D}, {0x0001E960, 0x0001EC70}, {0x0001ECB5, 0x0001ED00}, -{0x0001ED3E, 0x0001EDFF}, {0x0001EE04, 0x0001EE04}, {0x0001EE20, 0x0001EE20}, {0x0001EE23, 0x0001EE23}, -{0x0001EE25, 0x0001EE26}, {0x0001EE28, 0x0001EE28}, {0x0001EE33, 0x0001EE33}, {0x0001EE38, 0x0001EE38}, -{0x0001EE3A, 0x0001EE3A}, {0x0001EE3C, 0x0001EE41}, {0x0001EE43, 0x0001EE46}, {0x0001EE48, 0x0001EE48}, -{0x0001EE4A, 0x0001EE4A}, {0x0001EE4C, 0x0001EE4C}, {0x0001EE50, 0x0001EE50}, {0x0001EE53, 0x0001EE53}, -{0x0001EE55, 0x0001EE56}, {0x0001EE58, 0x0001EE58}, {0x0001EE5A, 0x0001EE5A}, {0x0001EE5C, 0x0001EE5C}, -{0x0001EE5E, 0x0001EE5E}, {0x0001EE60, 0x0001EE60}, {0x0001EE63, 0x0001EE63}, {0x0001EE65, 0x0001EE66}, -{0x0001EE6B, 0x0001EE6B}, {0x0001EE73, 0x0001EE73}, {0x0001EE78, 0x0001EE78}, {0x0001EE7D, 0x0001EE7D}, -{0x0001EE7F, 0x0001EE7F}, {0x0001EE8A, 0x0001EE8A}, {0x0001EE9C, 0x0001EEA0}, {0x0001EEA4, 0x0001EEA4}, -{0x0001EEAA, 0x0001EEAA}, {0x0001EEBC, 0x0001EEEF}, {0x0001EEF2, 0x0001EFFF}, {0x0001F02C, 0x0001F02F}, -{0x0001F094, 0x0001F09F}, {0x0001F0AF, 0x0001F0B0}, {0x0001F0C0, 0x0001F0C0}, {0x0001F0D0, 0x0001F0D0}, -{0x0001F0F6, 0x0001F0FF}, {0x0001F1AE, 0x0001F1E5}, {0x0001F203, 0x0001F20F}, {0x0001F23C, 0x0001F23F}, -{0x0001F249, 0x0001F24F}, {0x0001F252, 0x0001F25F}, {0x0001F266, 0x0001F2FF}, {0x0001F6D8, 0x0001F6DF}, -{0x0001F6ED, 0x0001F6EF}, {0x0001F6FD, 0x0001F6FF}, {0x0001F774, 0x0001F77F}, {0x0001F7D9, 0x0001F7DF}, -{0x0001F7EC, 0x0001F7FF}, {0x0001F80C, 0x0001F80F}, {0x0001F848, 0x0001F84F}, {0x0001F85A, 0x0001F85F}, -{0x0001F888, 0x0001F88F}, {0x0001F8AE, 0x0001F8AF}, {0x0001F8B2, 0x0001F8FF}, {0x0001F979, 0x0001F979}, -{0x0001F9CC, 0x0001F9CC}, {0x0001FA54, 0x0001FA5F}, {0x0001FA6E, 0x0001FA6F}, {0x0001FA75, 0x0001FA77}, -{0x0001FA7B, 0x0001FA7F}, {0x0001FA87, 0x0001FA8F}, {0x0001FAA9, 0x0001FAAF}, {0x0001FAB7, 0x0001FABF}, -{0x0001FAC3, 0x0001FACF}, {0x0001FAD7, 0x0001FAFF}, {0x0001FB93, 0x0001FB93}, {0x0001FBCB, 0x0001FBEF}, -{0x0001FBFA, 0x0001FFFF}, {0x0002A6DE, 0x0002A6FF}, {0x0002B735, 0x0002B73F}, {0x0002B81E, 0x0002B81F}, -{0x0002CEA2, 0x0002CEAF}, {0x0002EBE1, 0x0002F7FF}, {0x0002FA1E, 0x0002FFFF}, {0x0003134B, 0x000E00FF}, -{0x000E01F0, 0x0010FFFF}, -}; - -static const std::multimap unicode_map_nfd = { -{0x000000C0, 0x00000041}, {0x000000C0, 0x00000300}, {0x000000C1, 0x00000041}, {0x000000C1, 0x00000301}, -{0x000000C2, 0x00000041}, {0x000000C2, 0x00000302}, {0x000000C3, 0x00000041}, {0x000000C3, 0x00000303}, -{0x000000C4, 0x00000041}, {0x000000C4, 0x00000308}, {0x000000C5, 0x00000041}, {0x000000C5, 0x0000030A}, -{0x000000C7, 0x00000043}, {0x000000C7, 0x00000327}, {0x000000C8, 0x00000045}, {0x000000C8, 0x00000300}, -{0x000000C9, 0x00000045}, {0x000000C9, 0x00000301}, {0x000000CA, 0x00000045}, {0x000000CA, 0x00000302}, -{0x000000CB, 0x00000045}, {0x000000CB, 0x00000308}, {0x000000CC, 0x00000049}, {0x000000CC, 0x00000300}, -{0x000000CD, 0x00000049}, {0x000000CD, 0x00000301}, {0x000000CE, 0x00000049}, {0x000000CE, 0x00000302}, -{0x000000CF, 0x00000049}, {0x000000CF, 0x00000308}, {0x000000D1, 0x0000004E}, {0x000000D1, 0x00000303}, -{0x000000D2, 0x0000004F}, {0x000000D2, 0x00000300}, {0x000000D3, 0x0000004F}, {0x000000D3, 0x00000301}, -{0x000000D4, 0x0000004F}, {0x000000D4, 0x00000302}, {0x000000D5, 0x0000004F}, {0x000000D5, 0x00000303}, -{0x000000D6, 0x0000004F}, {0x000000D6, 0x00000308}, {0x000000D9, 0x00000055}, {0x000000D9, 0x00000300}, -{0x000000DA, 0x00000055}, {0x000000DA, 0x00000301}, {0x000000DB, 0x00000055}, {0x000000DB, 0x00000302}, -{0x000000DC, 0x00000055}, {0x000000DC, 0x00000308}, {0x000000DD, 0x00000059}, {0x000000DD, 0x00000301}, -{0x000000E0, 0x00000061}, {0x000000E0, 0x00000300}, {0x000000E1, 0x00000061}, {0x000000E1, 0x00000301}, -{0x000000E2, 0x00000061}, {0x000000E2, 0x00000302}, {0x000000E3, 0x00000061}, {0x000000E3, 0x00000303}, -{0x000000E4, 0x00000061}, {0x000000E4, 0x00000308}, {0x000000E5, 0x00000061}, {0x000000E5, 0x0000030A}, -{0x000000E7, 0x00000063}, {0x000000E7, 0x00000327}, {0x000000E8, 0x00000065}, {0x000000E8, 0x00000300}, -{0x000000E9, 0x00000065}, {0x000000E9, 0x00000301}, {0x000000EA, 0x00000065}, {0x000000EA, 0x00000302}, -{0x000000EB, 0x00000065}, {0x000000EB, 0x00000308}, {0x000000EC, 0x00000069}, {0x000000EC, 0x00000300}, -{0x000000ED, 0x00000069}, {0x000000ED, 0x00000301}, {0x000000EE, 0x00000069}, {0x000000EE, 0x00000302}, -{0x000000EF, 0x00000069}, {0x000000EF, 0x00000308}, {0x000000F1, 0x0000006E}, {0x000000F1, 0x00000303}, -{0x000000F2, 0x0000006F}, {0x000000F2, 0x00000300}, {0x000000F3, 0x0000006F}, {0x000000F3, 0x00000301}, -{0x000000F4, 0x0000006F}, {0x000000F4, 0x00000302}, {0x000000F5, 0x0000006F}, {0x000000F5, 0x00000303}, -{0x000000F6, 0x0000006F}, {0x000000F6, 0x00000308}, {0x000000F9, 0x00000075}, {0x000000F9, 0x00000300}, -{0x000000FA, 0x00000075}, {0x000000FA, 0x00000301}, {0x000000FB, 0x00000075}, {0x000000FB, 0x00000302}, -{0x000000FC, 0x00000075}, {0x000000FC, 0x00000308}, {0x000000FD, 0x00000079}, {0x000000FD, 0x00000301}, -{0x000000FF, 0x00000079}, {0x000000FF, 0x00000308}, {0x00000100, 0x00000041}, {0x00000100, 0x00000304}, -{0x00000101, 0x00000061}, {0x00000101, 0x00000304}, {0x00000102, 0x00000041}, {0x00000102, 0x00000306}, -{0x00000103, 0x00000061}, {0x00000103, 0x00000306}, {0x00000104, 0x00000041}, {0x00000104, 0x00000328}, -{0x00000105, 0x00000061}, {0x00000105, 0x00000328}, {0x00000106, 0x00000043}, {0x00000106, 0x00000301}, -{0x00000107, 0x00000063}, {0x00000107, 0x00000301}, {0x00000108, 0x00000043}, {0x00000108, 0x00000302}, -{0x00000109, 0x00000063}, {0x00000109, 0x00000302}, {0x0000010A, 0x00000043}, {0x0000010A, 0x00000307}, -{0x0000010B, 0x00000063}, {0x0000010B, 0x00000307}, {0x0000010C, 0x00000043}, {0x0000010C, 0x0000030C}, -{0x0000010D, 0x00000063}, {0x0000010D, 0x0000030C}, {0x0000010E, 0x00000044}, {0x0000010E, 0x0000030C}, -{0x0000010F, 0x00000064}, {0x0000010F, 0x0000030C}, {0x00000112, 0x00000045}, {0x00000112, 0x00000304}, -{0x00000113, 0x00000065}, {0x00000113, 0x00000304}, {0x00000114, 0x00000045}, {0x00000114, 0x00000306}, -{0x00000115, 0x00000065}, {0x00000115, 0x00000306}, {0x00000116, 0x00000045}, {0x00000116, 0x00000307}, -{0x00000117, 0x00000065}, {0x00000117, 0x00000307}, {0x00000118, 0x00000045}, {0x00000118, 0x00000328}, -{0x00000119, 0x00000065}, {0x00000119, 0x00000328}, {0x0000011A, 0x00000045}, {0x0000011A, 0x0000030C}, -{0x0000011B, 0x00000065}, {0x0000011B, 0x0000030C}, {0x0000011C, 0x00000047}, {0x0000011C, 0x00000302}, -{0x0000011D, 0x00000067}, {0x0000011D, 0x00000302}, {0x0000011E, 0x00000047}, {0x0000011E, 0x00000306}, -{0x0000011F, 0x00000067}, {0x0000011F, 0x00000306}, {0x00000120, 0x00000047}, {0x00000120, 0x00000307}, -{0x00000121, 0x00000067}, {0x00000121, 0x00000307}, {0x00000122, 0x00000047}, {0x00000122, 0x00000327}, -{0x00000123, 0x00000067}, {0x00000123, 0x00000327}, {0x00000124, 0x00000048}, {0x00000124, 0x00000302}, -{0x00000125, 0x00000068}, {0x00000125, 0x00000302}, {0x00000128, 0x00000049}, {0x00000128, 0x00000303}, -{0x00000129, 0x00000069}, {0x00000129, 0x00000303}, {0x0000012A, 0x00000049}, {0x0000012A, 0x00000304}, -{0x0000012B, 0x00000069}, {0x0000012B, 0x00000304}, {0x0000012C, 0x00000049}, {0x0000012C, 0x00000306}, -{0x0000012D, 0x00000069}, {0x0000012D, 0x00000306}, {0x0000012E, 0x00000049}, {0x0000012E, 0x00000328}, -{0x0000012F, 0x00000069}, {0x0000012F, 0x00000328}, {0x00000130, 0x00000049}, {0x00000130, 0x00000307}, -{0x00000134, 0x0000004A}, {0x00000134, 0x00000302}, {0x00000135, 0x0000006A}, {0x00000135, 0x00000302}, -{0x00000136, 0x0000004B}, {0x00000136, 0x00000327}, {0x00000137, 0x0000006B}, {0x00000137, 0x00000327}, -{0x00000139, 0x0000004C}, {0x00000139, 0x00000301}, {0x0000013A, 0x0000006C}, {0x0000013A, 0x00000301}, -{0x0000013B, 0x0000004C}, {0x0000013B, 0x00000327}, {0x0000013C, 0x0000006C}, {0x0000013C, 0x00000327}, -{0x0000013D, 0x0000004C}, {0x0000013D, 0x0000030C}, {0x0000013E, 0x0000006C}, {0x0000013E, 0x0000030C}, -{0x00000143, 0x0000004E}, {0x00000143, 0x00000301}, {0x00000144, 0x0000006E}, {0x00000144, 0x00000301}, -{0x00000145, 0x0000004E}, {0x00000145, 0x00000327}, {0x00000146, 0x0000006E}, {0x00000146, 0x00000327}, -{0x00000147, 0x0000004E}, {0x00000147, 0x0000030C}, {0x00000148, 0x0000006E}, {0x00000148, 0x0000030C}, -{0x0000014C, 0x0000004F}, {0x0000014C, 0x00000304}, {0x0000014D, 0x0000006F}, {0x0000014D, 0x00000304}, -{0x0000014E, 0x0000004F}, {0x0000014E, 0x00000306}, {0x0000014F, 0x0000006F}, {0x0000014F, 0x00000306}, -{0x00000150, 0x0000004F}, {0x00000150, 0x0000030B}, {0x00000151, 0x0000006F}, {0x00000151, 0x0000030B}, -{0x00000154, 0x00000052}, {0x00000154, 0x00000301}, {0x00000155, 0x00000072}, {0x00000155, 0x00000301}, -{0x00000156, 0x00000052}, {0x00000156, 0x00000327}, {0x00000157, 0x00000072}, {0x00000157, 0x00000327}, -{0x00000158, 0x00000052}, {0x00000158, 0x0000030C}, {0x00000159, 0x00000072}, {0x00000159, 0x0000030C}, -{0x0000015A, 0x00000053}, {0x0000015A, 0x00000301}, {0x0000015B, 0x00000073}, {0x0000015B, 0x00000301}, -{0x0000015C, 0x00000053}, {0x0000015C, 0x00000302}, {0x0000015D, 0x00000073}, {0x0000015D, 0x00000302}, -{0x0000015E, 0x00000053}, {0x0000015E, 0x00000327}, {0x0000015F, 0x00000073}, {0x0000015F, 0x00000327}, -{0x00000160, 0x00000053}, {0x00000160, 0x0000030C}, {0x00000161, 0x00000073}, {0x00000161, 0x0000030C}, -{0x00000162, 0x00000054}, {0x00000162, 0x00000327}, {0x00000163, 0x00000074}, {0x00000163, 0x00000327}, -{0x00000164, 0x00000054}, {0x00000164, 0x0000030C}, {0x00000165, 0x00000074}, {0x00000165, 0x0000030C}, -{0x00000168, 0x00000055}, {0x00000168, 0x00000303}, {0x00000169, 0x00000075}, {0x00000169, 0x00000303}, -{0x0000016A, 0x00000055}, {0x0000016A, 0x00000304}, {0x0000016B, 0x00000075}, {0x0000016B, 0x00000304}, -{0x0000016C, 0x00000055}, {0x0000016C, 0x00000306}, {0x0000016D, 0x00000075}, {0x0000016D, 0x00000306}, -{0x0000016E, 0x00000055}, {0x0000016E, 0x0000030A}, {0x0000016F, 0x00000075}, {0x0000016F, 0x0000030A}, -{0x00000170, 0x00000055}, {0x00000170, 0x0000030B}, {0x00000171, 0x00000075}, {0x00000171, 0x0000030B}, -{0x00000172, 0x00000055}, {0x00000172, 0x00000328}, {0x00000173, 0x00000075}, {0x00000173, 0x00000328}, -{0x00000174, 0x00000057}, {0x00000174, 0x00000302}, {0x00000175, 0x00000077}, {0x00000175, 0x00000302}, -{0x00000176, 0x00000059}, {0x00000176, 0x00000302}, {0x00000177, 0x00000079}, {0x00000177, 0x00000302}, -{0x00000178, 0x00000059}, {0x00000178, 0x00000308}, {0x00000179, 0x0000005A}, {0x00000179, 0x00000301}, -{0x0000017A, 0x0000007A}, {0x0000017A, 0x00000301}, {0x0000017B, 0x0000005A}, {0x0000017B, 0x00000307}, -{0x0000017C, 0x0000007A}, {0x0000017C, 0x00000307}, {0x0000017D, 0x0000005A}, {0x0000017D, 0x0000030C}, -{0x0000017E, 0x0000007A}, {0x0000017E, 0x0000030C}, {0x000001A0, 0x0000004F}, {0x000001A0, 0x0000031B}, -{0x000001A1, 0x0000006F}, {0x000001A1, 0x0000031B}, {0x000001AF, 0x00000055}, {0x000001AF, 0x0000031B}, -{0x000001B0, 0x00000075}, {0x000001B0, 0x0000031B}, {0x000001CD, 0x00000041}, {0x000001CD, 0x0000030C}, -{0x000001CE, 0x00000061}, {0x000001CE, 0x0000030C}, {0x000001CF, 0x00000049}, {0x000001CF, 0x0000030C}, -{0x000001D0, 0x00000069}, {0x000001D0, 0x0000030C}, {0x000001D1, 0x0000004F}, {0x000001D1, 0x0000030C}, -{0x000001D2, 0x0000006F}, {0x000001D2, 0x0000030C}, {0x000001D3, 0x00000055}, {0x000001D3, 0x0000030C}, -{0x000001D4, 0x00000075}, {0x000001D4, 0x0000030C}, {0x000001D5, 0x00000055}, {0x000001D5, 0x00000308}, -{0x000001D5, 0x00000304}, {0x000001D6, 0x00000075}, {0x000001D6, 0x00000308}, {0x000001D6, 0x00000304}, -{0x000001D7, 0x00000055}, {0x000001D7, 0x00000308}, {0x000001D7, 0x00000301}, {0x000001D8, 0x00000075}, -{0x000001D8, 0x00000308}, {0x000001D8, 0x00000301}, {0x000001D9, 0x00000055}, {0x000001D9, 0x00000308}, -{0x000001D9, 0x0000030C}, {0x000001DA, 0x00000075}, {0x000001DA, 0x00000308}, {0x000001DA, 0x0000030C}, -{0x000001DB, 0x00000055}, {0x000001DB, 0x00000308}, {0x000001DB, 0x00000300}, {0x000001DC, 0x00000075}, -{0x000001DC, 0x00000308}, {0x000001DC, 0x00000300}, {0x000001DE, 0x00000041}, {0x000001DE, 0x00000308}, -{0x000001DE, 0x00000304}, {0x000001DF, 0x00000061}, {0x000001DF, 0x00000308}, {0x000001DF, 0x00000304}, -{0x000001E0, 0x00000041}, {0x000001E0, 0x00000307}, {0x000001E0, 0x00000304}, {0x000001E1, 0x00000061}, -{0x000001E1, 0x00000307}, {0x000001E1, 0x00000304}, {0x000001E2, 0x000000C6}, {0x000001E2, 0x00000304}, -{0x000001E3, 0x000000E6}, {0x000001E3, 0x00000304}, {0x000001E6, 0x00000047}, {0x000001E6, 0x0000030C}, -{0x000001E7, 0x00000067}, {0x000001E7, 0x0000030C}, {0x000001E8, 0x0000004B}, {0x000001E8, 0x0000030C}, -{0x000001E9, 0x0000006B}, {0x000001E9, 0x0000030C}, {0x000001EA, 0x0000004F}, {0x000001EA, 0x00000328}, -{0x000001EB, 0x0000006F}, {0x000001EB, 0x00000328}, {0x000001EC, 0x0000004F}, {0x000001EC, 0x00000328}, -{0x000001EC, 0x00000304}, {0x000001ED, 0x0000006F}, {0x000001ED, 0x00000328}, {0x000001ED, 0x00000304}, -{0x000001EE, 0x000001B7}, {0x000001EE, 0x0000030C}, {0x000001EF, 0x00000292}, {0x000001EF, 0x0000030C}, -{0x000001F0, 0x0000006A}, {0x000001F0, 0x0000030C}, {0x000001F4, 0x00000047}, {0x000001F4, 0x00000301}, -{0x000001F5, 0x00000067}, {0x000001F5, 0x00000301}, {0x000001F8, 0x0000004E}, {0x000001F8, 0x00000300}, -{0x000001F9, 0x0000006E}, {0x000001F9, 0x00000300}, {0x000001FA, 0x00000041}, {0x000001FA, 0x0000030A}, -{0x000001FA, 0x00000301}, {0x000001FB, 0x00000061}, {0x000001FB, 0x0000030A}, {0x000001FB, 0x00000301}, -{0x000001FC, 0x000000C6}, {0x000001FC, 0x00000301}, {0x000001FD, 0x000000E6}, {0x000001FD, 0x00000301}, -{0x000001FE, 0x000000D8}, {0x000001FE, 0x00000301}, {0x000001FF, 0x000000F8}, {0x000001FF, 0x00000301}, -{0x00000200, 0x00000041}, {0x00000200, 0x0000030F}, {0x00000201, 0x00000061}, {0x00000201, 0x0000030F}, -{0x00000202, 0x00000041}, {0x00000202, 0x00000311}, {0x00000203, 0x00000061}, {0x00000203, 0x00000311}, -{0x00000204, 0x00000045}, {0x00000204, 0x0000030F}, {0x00000205, 0x00000065}, {0x00000205, 0x0000030F}, -{0x00000206, 0x00000045}, {0x00000206, 0x00000311}, {0x00000207, 0x00000065}, {0x00000207, 0x00000311}, -{0x00000208, 0x00000049}, {0x00000208, 0x0000030F}, {0x00000209, 0x00000069}, {0x00000209, 0x0000030F}, -{0x0000020A, 0x00000049}, {0x0000020A, 0x00000311}, {0x0000020B, 0x00000069}, {0x0000020B, 0x00000311}, -{0x0000020C, 0x0000004F}, {0x0000020C, 0x0000030F}, {0x0000020D, 0x0000006F}, {0x0000020D, 0x0000030F}, -{0x0000020E, 0x0000004F}, {0x0000020E, 0x00000311}, {0x0000020F, 0x0000006F}, {0x0000020F, 0x00000311}, -{0x00000210, 0x00000052}, {0x00000210, 0x0000030F}, {0x00000211, 0x00000072}, {0x00000211, 0x0000030F}, -{0x00000212, 0x00000052}, {0x00000212, 0x00000311}, {0x00000213, 0x00000072}, {0x00000213, 0x00000311}, -{0x00000214, 0x00000055}, {0x00000214, 0x0000030F}, {0x00000215, 0x00000075}, {0x00000215, 0x0000030F}, -{0x00000216, 0x00000055}, {0x00000216, 0x00000311}, {0x00000217, 0x00000075}, {0x00000217, 0x00000311}, -{0x00000218, 0x00000053}, {0x00000218, 0x00000326}, {0x00000219, 0x00000073}, {0x00000219, 0x00000326}, -{0x0000021A, 0x00000054}, {0x0000021A, 0x00000326}, {0x0000021B, 0x00000074}, {0x0000021B, 0x00000326}, -{0x0000021E, 0x00000048}, {0x0000021E, 0x0000030C}, {0x0000021F, 0x00000068}, {0x0000021F, 0x0000030C}, -{0x00000226, 0x00000041}, {0x00000226, 0x00000307}, {0x00000227, 0x00000061}, {0x00000227, 0x00000307}, -{0x00000228, 0x00000045}, {0x00000228, 0x00000327}, {0x00000229, 0x00000065}, {0x00000229, 0x00000327}, -{0x0000022A, 0x0000004F}, {0x0000022A, 0x00000308}, {0x0000022A, 0x00000304}, {0x0000022B, 0x0000006F}, -{0x0000022B, 0x00000308}, {0x0000022B, 0x00000304}, {0x0000022C, 0x0000004F}, {0x0000022C, 0x00000303}, -{0x0000022C, 0x00000304}, {0x0000022D, 0x0000006F}, {0x0000022D, 0x00000303}, {0x0000022D, 0x00000304}, -{0x0000022E, 0x0000004F}, {0x0000022E, 0x00000307}, {0x0000022F, 0x0000006F}, {0x0000022F, 0x00000307}, -{0x00000230, 0x0000004F}, {0x00000230, 0x00000307}, {0x00000230, 0x00000304}, {0x00000231, 0x0000006F}, -{0x00000231, 0x00000307}, {0x00000231, 0x00000304}, {0x00000232, 0x00000059}, {0x00000232, 0x00000304}, -{0x00000233, 0x00000079}, {0x00000233, 0x00000304}, {0x00000340, 0x00000300}, {0x00000341, 0x00000301}, -{0x00000343, 0x00000313}, {0x00000344, 0x00000308}, {0x00000344, 0x00000301}, {0x00000374, 0x000002B9}, -{0x0000037E, 0x0000003B}, {0x00000385, 0x000000A8}, {0x00000385, 0x00000301}, {0x00000386, 0x00000391}, -{0x00000386, 0x00000301}, {0x00000387, 0x000000B7}, {0x00000388, 0x00000395}, {0x00000388, 0x00000301}, -{0x00000389, 0x00000397}, {0x00000389, 0x00000301}, {0x0000038A, 0x00000399}, {0x0000038A, 0x00000301}, -{0x0000038C, 0x0000039F}, {0x0000038C, 0x00000301}, {0x0000038E, 0x000003A5}, {0x0000038E, 0x00000301}, -{0x0000038F, 0x000003A9}, {0x0000038F, 0x00000301}, {0x00000390, 0x000003B9}, {0x00000390, 0x00000308}, -{0x00000390, 0x00000301}, {0x000003AA, 0x00000399}, {0x000003AA, 0x00000308}, {0x000003AB, 0x000003A5}, -{0x000003AB, 0x00000308}, {0x000003AC, 0x000003B1}, {0x000003AC, 0x00000301}, {0x000003AD, 0x000003B5}, -{0x000003AD, 0x00000301}, {0x000003AE, 0x000003B7}, {0x000003AE, 0x00000301}, {0x000003AF, 0x000003B9}, -{0x000003AF, 0x00000301}, {0x000003B0, 0x000003C5}, {0x000003B0, 0x00000308}, {0x000003B0, 0x00000301}, -{0x000003CA, 0x000003B9}, {0x000003CA, 0x00000308}, {0x000003CB, 0x000003C5}, {0x000003CB, 0x00000308}, -{0x000003CC, 0x000003BF}, {0x000003CC, 0x00000301}, {0x000003CD, 0x000003C5}, {0x000003CD, 0x00000301}, -{0x000003CE, 0x000003C9}, {0x000003CE, 0x00000301}, {0x000003D3, 0x000003D2}, {0x000003D3, 0x00000301}, -{0x000003D4, 0x000003D2}, {0x000003D4, 0x00000308}, {0x00000400, 0x00000415}, {0x00000400, 0x00000300}, -{0x00000401, 0x00000415}, {0x00000401, 0x00000308}, {0x00000403, 0x00000413}, {0x00000403, 0x00000301}, -{0x00000407, 0x00000406}, {0x00000407, 0x00000308}, {0x0000040C, 0x0000041A}, {0x0000040C, 0x00000301}, -{0x0000040D, 0x00000418}, {0x0000040D, 0x00000300}, {0x0000040E, 0x00000423}, {0x0000040E, 0x00000306}, -{0x00000419, 0x00000418}, {0x00000419, 0x00000306}, {0x00000439, 0x00000438}, {0x00000439, 0x00000306}, -{0x00000450, 0x00000435}, {0x00000450, 0x00000300}, {0x00000451, 0x00000435}, {0x00000451, 0x00000308}, -{0x00000453, 0x00000433}, {0x00000453, 0x00000301}, {0x00000457, 0x00000456}, {0x00000457, 0x00000308}, -{0x0000045C, 0x0000043A}, {0x0000045C, 0x00000301}, {0x0000045D, 0x00000438}, {0x0000045D, 0x00000300}, -{0x0000045E, 0x00000443}, {0x0000045E, 0x00000306}, {0x00000476, 0x00000474}, {0x00000476, 0x0000030F}, -{0x00000477, 0x00000475}, {0x00000477, 0x0000030F}, {0x000004C1, 0x00000416}, {0x000004C1, 0x00000306}, -{0x000004C2, 0x00000436}, {0x000004C2, 0x00000306}, {0x000004D0, 0x00000410}, {0x000004D0, 0x00000306}, -{0x000004D1, 0x00000430}, {0x000004D1, 0x00000306}, {0x000004D2, 0x00000410}, {0x000004D2, 0x00000308}, -{0x000004D3, 0x00000430}, {0x000004D3, 0x00000308}, {0x000004D6, 0x00000415}, {0x000004D6, 0x00000306}, -{0x000004D7, 0x00000435}, {0x000004D7, 0x00000306}, {0x000004DA, 0x000004D8}, {0x000004DA, 0x00000308}, -{0x000004DB, 0x000004D9}, {0x000004DB, 0x00000308}, {0x000004DC, 0x00000416}, {0x000004DC, 0x00000308}, -{0x000004DD, 0x00000436}, {0x000004DD, 0x00000308}, {0x000004DE, 0x00000417}, {0x000004DE, 0x00000308}, -{0x000004DF, 0x00000437}, {0x000004DF, 0x00000308}, {0x000004E2, 0x00000418}, {0x000004E2, 0x00000304}, -{0x000004E3, 0x00000438}, {0x000004E3, 0x00000304}, {0x000004E4, 0x00000418}, {0x000004E4, 0x00000308}, -{0x000004E5, 0x00000438}, {0x000004E5, 0x00000308}, {0x000004E6, 0x0000041E}, {0x000004E6, 0x00000308}, -{0x000004E7, 0x0000043E}, {0x000004E7, 0x00000308}, {0x000004EA, 0x000004E8}, {0x000004EA, 0x00000308}, -{0x000004EB, 0x000004E9}, {0x000004EB, 0x00000308}, {0x000004EC, 0x0000042D}, {0x000004EC, 0x00000308}, -{0x000004ED, 0x0000044D}, {0x000004ED, 0x00000308}, {0x000004EE, 0x00000423}, {0x000004EE, 0x00000304}, -{0x000004EF, 0x00000443}, {0x000004EF, 0x00000304}, {0x000004F0, 0x00000423}, {0x000004F0, 0x00000308}, -{0x000004F1, 0x00000443}, {0x000004F1, 0x00000308}, {0x000004F2, 0x00000423}, {0x000004F2, 0x0000030B}, -{0x000004F3, 0x00000443}, {0x000004F3, 0x0000030B}, {0x000004F4, 0x00000427}, {0x000004F4, 0x00000308}, -{0x000004F5, 0x00000447}, {0x000004F5, 0x00000308}, {0x000004F8, 0x0000042B}, {0x000004F8, 0x00000308}, -{0x000004F9, 0x0000044B}, {0x000004F9, 0x00000308}, {0x00000622, 0x00000627}, {0x00000622, 0x00000653}, -{0x00000623, 0x00000627}, {0x00000623, 0x00000654}, {0x00000624, 0x00000648}, {0x00000624, 0x00000654}, -{0x00000625, 0x00000627}, {0x00000625, 0x00000655}, {0x00000626, 0x0000064A}, {0x00000626, 0x00000654}, -{0x000006C0, 0x000006D5}, {0x000006C0, 0x00000654}, {0x000006C2, 0x000006C1}, {0x000006C2, 0x00000654}, -{0x000006D3, 0x000006D2}, {0x000006D3, 0x00000654}, {0x00000929, 0x00000928}, {0x00000929, 0x0000093C}, -{0x00000931, 0x00000930}, {0x00000931, 0x0000093C}, {0x00000934, 0x00000933}, {0x00000934, 0x0000093C}, -{0x00000958, 0x00000915}, {0x00000958, 0x0000093C}, {0x00000959, 0x00000916}, {0x00000959, 0x0000093C}, -{0x0000095A, 0x00000917}, {0x0000095A, 0x0000093C}, {0x0000095B, 0x0000091C}, {0x0000095B, 0x0000093C}, -{0x0000095C, 0x00000921}, {0x0000095C, 0x0000093C}, {0x0000095D, 0x00000922}, {0x0000095D, 0x0000093C}, -{0x0000095E, 0x0000092B}, {0x0000095E, 0x0000093C}, {0x0000095F, 0x0000092F}, {0x0000095F, 0x0000093C}, -{0x000009CB, 0x000009C7}, {0x000009CB, 0x000009BE}, {0x000009CC, 0x000009C7}, {0x000009CC, 0x000009D7}, -{0x000009DC, 0x000009A1}, {0x000009DC, 0x000009BC}, {0x000009DD, 0x000009A2}, {0x000009DD, 0x000009BC}, -{0x000009DF, 0x000009AF}, {0x000009DF, 0x000009BC}, {0x00000A33, 0x00000A32}, {0x00000A33, 0x00000A3C}, -{0x00000A36, 0x00000A38}, {0x00000A36, 0x00000A3C}, {0x00000A59, 0x00000A16}, {0x00000A59, 0x00000A3C}, -{0x00000A5A, 0x00000A17}, {0x00000A5A, 0x00000A3C}, {0x00000A5B, 0x00000A1C}, {0x00000A5B, 0x00000A3C}, -{0x00000A5E, 0x00000A2B}, {0x00000A5E, 0x00000A3C}, {0x00000B48, 0x00000B47}, {0x00000B48, 0x00000B56}, -{0x00000B4B, 0x00000B47}, {0x00000B4B, 0x00000B3E}, {0x00000B4C, 0x00000B47}, {0x00000B4C, 0x00000B57}, -{0x00000B5C, 0x00000B21}, {0x00000B5C, 0x00000B3C}, {0x00000B5D, 0x00000B22}, {0x00000B5D, 0x00000B3C}, -{0x00000B94, 0x00000B92}, {0x00000B94, 0x00000BD7}, {0x00000BCA, 0x00000BC6}, {0x00000BCA, 0x00000BBE}, -{0x00000BCB, 0x00000BC7}, {0x00000BCB, 0x00000BBE}, {0x00000BCC, 0x00000BC6}, {0x00000BCC, 0x00000BD7}, -{0x00000C48, 0x00000C46}, {0x00000C48, 0x00000C56}, {0x00000CC0, 0x00000CBF}, {0x00000CC0, 0x00000CD5}, -{0x00000CC7, 0x00000CC6}, {0x00000CC7, 0x00000CD5}, {0x00000CC8, 0x00000CC6}, {0x00000CC8, 0x00000CD6}, -{0x00000CCA, 0x00000CC6}, {0x00000CCA, 0x00000CC2}, {0x00000CCB, 0x00000CC6}, {0x00000CCB, 0x00000CC2}, -{0x00000CCB, 0x00000CD5}, {0x00000D4A, 0x00000D46}, {0x00000D4A, 0x00000D3E}, {0x00000D4B, 0x00000D47}, -{0x00000D4B, 0x00000D3E}, {0x00000D4C, 0x00000D46}, {0x00000D4C, 0x00000D57}, {0x00000DDA, 0x00000DD9}, -{0x00000DDA, 0x00000DCA}, {0x00000DDC, 0x00000DD9}, {0x00000DDC, 0x00000DCF}, {0x00000DDD, 0x00000DD9}, -{0x00000DDD, 0x00000DCF}, {0x00000DDD, 0x00000DCA}, {0x00000DDE, 0x00000DD9}, {0x00000DDE, 0x00000DDF}, -{0x00000F43, 0x00000F42}, {0x00000F43, 0x00000FB7}, {0x00000F4D, 0x00000F4C}, {0x00000F4D, 0x00000FB7}, -{0x00000F52, 0x00000F51}, {0x00000F52, 0x00000FB7}, {0x00000F57, 0x00000F56}, {0x00000F57, 0x00000FB7}, -{0x00000F5C, 0x00000F5B}, {0x00000F5C, 0x00000FB7}, {0x00000F69, 0x00000F40}, {0x00000F69, 0x00000FB5}, -{0x00000F73, 0x00000F71}, {0x00000F73, 0x00000F72}, {0x00000F75, 0x00000F71}, {0x00000F75, 0x00000F74}, -{0x00000F76, 0x00000FB2}, {0x00000F76, 0x00000F80}, {0x00000F78, 0x00000FB3}, {0x00000F78, 0x00000F80}, -{0x00000F81, 0x00000F71}, {0x00000F81, 0x00000F80}, {0x00000F93, 0x00000F92}, {0x00000F93, 0x00000FB7}, -{0x00000F9D, 0x00000F9C}, {0x00000F9D, 0x00000FB7}, {0x00000FA2, 0x00000FA1}, {0x00000FA2, 0x00000FB7}, -{0x00000FA7, 0x00000FA6}, {0x00000FA7, 0x00000FB7}, {0x00000FAC, 0x00000FAB}, {0x00000FAC, 0x00000FB7}, -{0x00000FB9, 0x00000F90}, {0x00000FB9, 0x00000FB5}, {0x00001026, 0x00001025}, {0x00001026, 0x0000102E}, -{0x00001B06, 0x00001B05}, {0x00001B06, 0x00001B35}, {0x00001B08, 0x00001B07}, {0x00001B08, 0x00001B35}, -{0x00001B0A, 0x00001B09}, {0x00001B0A, 0x00001B35}, {0x00001B0C, 0x00001B0B}, {0x00001B0C, 0x00001B35}, -{0x00001B0E, 0x00001B0D}, {0x00001B0E, 0x00001B35}, {0x00001B12, 0x00001B11}, {0x00001B12, 0x00001B35}, -{0x00001B3B, 0x00001B3A}, {0x00001B3B, 0x00001B35}, {0x00001B3D, 0x00001B3C}, {0x00001B3D, 0x00001B35}, -{0x00001B40, 0x00001B3E}, {0x00001B40, 0x00001B35}, {0x00001B41, 0x00001B3F}, {0x00001B41, 0x00001B35}, -{0x00001B43, 0x00001B42}, {0x00001B43, 0x00001B35}, {0x00001E00, 0x00000041}, {0x00001E00, 0x00000325}, -{0x00001E01, 0x00000061}, {0x00001E01, 0x00000325}, {0x00001E02, 0x00000042}, {0x00001E02, 0x00000307}, -{0x00001E03, 0x00000062}, {0x00001E03, 0x00000307}, {0x00001E04, 0x00000042}, {0x00001E04, 0x00000323}, -{0x00001E05, 0x00000062}, {0x00001E05, 0x00000323}, {0x00001E06, 0x00000042}, {0x00001E06, 0x00000331}, -{0x00001E07, 0x00000062}, {0x00001E07, 0x00000331}, {0x00001E08, 0x00000043}, {0x00001E08, 0x00000327}, -{0x00001E08, 0x00000301}, {0x00001E09, 0x00000063}, {0x00001E09, 0x00000327}, {0x00001E09, 0x00000301}, -{0x00001E0A, 0x00000044}, {0x00001E0A, 0x00000307}, {0x00001E0B, 0x00000064}, {0x00001E0B, 0x00000307}, -{0x00001E0C, 0x00000044}, {0x00001E0C, 0x00000323}, {0x00001E0D, 0x00000064}, {0x00001E0D, 0x00000323}, -{0x00001E0E, 0x00000044}, {0x00001E0E, 0x00000331}, {0x00001E0F, 0x00000064}, {0x00001E0F, 0x00000331}, -{0x00001E10, 0x00000044}, {0x00001E10, 0x00000327}, {0x00001E11, 0x00000064}, {0x00001E11, 0x00000327}, -{0x00001E12, 0x00000044}, {0x00001E12, 0x0000032D}, {0x00001E13, 0x00000064}, {0x00001E13, 0x0000032D}, -{0x00001E14, 0x00000045}, {0x00001E14, 0x00000304}, {0x00001E14, 0x00000300}, {0x00001E15, 0x00000065}, -{0x00001E15, 0x00000304}, {0x00001E15, 0x00000300}, {0x00001E16, 0x00000045}, {0x00001E16, 0x00000304}, -{0x00001E16, 0x00000301}, {0x00001E17, 0x00000065}, {0x00001E17, 0x00000304}, {0x00001E17, 0x00000301}, -{0x00001E18, 0x00000045}, {0x00001E18, 0x0000032D}, {0x00001E19, 0x00000065}, {0x00001E19, 0x0000032D}, -{0x00001E1A, 0x00000045}, {0x00001E1A, 0x00000330}, {0x00001E1B, 0x00000065}, {0x00001E1B, 0x00000330}, -{0x00001E1C, 0x00000045}, {0x00001E1C, 0x00000327}, {0x00001E1C, 0x00000306}, {0x00001E1D, 0x00000065}, -{0x00001E1D, 0x00000327}, {0x00001E1D, 0x00000306}, {0x00001E1E, 0x00000046}, {0x00001E1E, 0x00000307}, -{0x00001E1F, 0x00000066}, {0x00001E1F, 0x00000307}, {0x00001E20, 0x00000047}, {0x00001E20, 0x00000304}, -{0x00001E21, 0x00000067}, {0x00001E21, 0x00000304}, {0x00001E22, 0x00000048}, {0x00001E22, 0x00000307}, -{0x00001E23, 0x00000068}, {0x00001E23, 0x00000307}, {0x00001E24, 0x00000048}, {0x00001E24, 0x00000323}, -{0x00001E25, 0x00000068}, {0x00001E25, 0x00000323}, {0x00001E26, 0x00000048}, {0x00001E26, 0x00000308}, -{0x00001E27, 0x00000068}, {0x00001E27, 0x00000308}, {0x00001E28, 0x00000048}, {0x00001E28, 0x00000327}, -{0x00001E29, 0x00000068}, {0x00001E29, 0x00000327}, {0x00001E2A, 0x00000048}, {0x00001E2A, 0x0000032E}, -{0x00001E2B, 0x00000068}, {0x00001E2B, 0x0000032E}, {0x00001E2C, 0x00000049}, {0x00001E2C, 0x00000330}, -{0x00001E2D, 0x00000069}, {0x00001E2D, 0x00000330}, {0x00001E2E, 0x00000049}, {0x00001E2E, 0x00000308}, -{0x00001E2E, 0x00000301}, {0x00001E2F, 0x00000069}, {0x00001E2F, 0x00000308}, {0x00001E2F, 0x00000301}, -{0x00001E30, 0x0000004B}, {0x00001E30, 0x00000301}, {0x00001E31, 0x0000006B}, {0x00001E31, 0x00000301}, -{0x00001E32, 0x0000004B}, {0x00001E32, 0x00000323}, {0x00001E33, 0x0000006B}, {0x00001E33, 0x00000323}, -{0x00001E34, 0x0000004B}, {0x00001E34, 0x00000331}, {0x00001E35, 0x0000006B}, {0x00001E35, 0x00000331}, -{0x00001E36, 0x0000004C}, {0x00001E36, 0x00000323}, {0x00001E37, 0x0000006C}, {0x00001E37, 0x00000323}, -{0x00001E38, 0x0000004C}, {0x00001E38, 0x00000323}, {0x00001E38, 0x00000304}, {0x00001E39, 0x0000006C}, -{0x00001E39, 0x00000323}, {0x00001E39, 0x00000304}, {0x00001E3A, 0x0000004C}, {0x00001E3A, 0x00000331}, -{0x00001E3B, 0x0000006C}, {0x00001E3B, 0x00000331}, {0x00001E3C, 0x0000004C}, {0x00001E3C, 0x0000032D}, -{0x00001E3D, 0x0000006C}, {0x00001E3D, 0x0000032D}, {0x00001E3E, 0x0000004D}, {0x00001E3E, 0x00000301}, -{0x00001E3F, 0x0000006D}, {0x00001E3F, 0x00000301}, {0x00001E40, 0x0000004D}, {0x00001E40, 0x00000307}, -{0x00001E41, 0x0000006D}, {0x00001E41, 0x00000307}, {0x00001E42, 0x0000004D}, {0x00001E42, 0x00000323}, -{0x00001E43, 0x0000006D}, {0x00001E43, 0x00000323}, {0x00001E44, 0x0000004E}, {0x00001E44, 0x00000307}, -{0x00001E45, 0x0000006E}, {0x00001E45, 0x00000307}, {0x00001E46, 0x0000004E}, {0x00001E46, 0x00000323}, -{0x00001E47, 0x0000006E}, {0x00001E47, 0x00000323}, {0x00001E48, 0x0000004E}, {0x00001E48, 0x00000331}, -{0x00001E49, 0x0000006E}, {0x00001E49, 0x00000331}, {0x00001E4A, 0x0000004E}, {0x00001E4A, 0x0000032D}, -{0x00001E4B, 0x0000006E}, {0x00001E4B, 0x0000032D}, {0x00001E4C, 0x0000004F}, {0x00001E4C, 0x00000303}, -{0x00001E4C, 0x00000301}, {0x00001E4D, 0x0000006F}, {0x00001E4D, 0x00000303}, {0x00001E4D, 0x00000301}, -{0x00001E4E, 0x0000004F}, {0x00001E4E, 0x00000303}, {0x00001E4E, 0x00000308}, {0x00001E4F, 0x0000006F}, -{0x00001E4F, 0x00000303}, {0x00001E4F, 0x00000308}, {0x00001E50, 0x0000004F}, {0x00001E50, 0x00000304}, -{0x00001E50, 0x00000300}, {0x00001E51, 0x0000006F}, {0x00001E51, 0x00000304}, {0x00001E51, 0x00000300}, -{0x00001E52, 0x0000004F}, {0x00001E52, 0x00000304}, {0x00001E52, 0x00000301}, {0x00001E53, 0x0000006F}, -{0x00001E53, 0x00000304}, {0x00001E53, 0x00000301}, {0x00001E54, 0x00000050}, {0x00001E54, 0x00000301}, -{0x00001E55, 0x00000070}, {0x00001E55, 0x00000301}, {0x00001E56, 0x00000050}, {0x00001E56, 0x00000307}, -{0x00001E57, 0x00000070}, {0x00001E57, 0x00000307}, {0x00001E58, 0x00000052}, {0x00001E58, 0x00000307}, -{0x00001E59, 0x00000072}, {0x00001E59, 0x00000307}, {0x00001E5A, 0x00000052}, {0x00001E5A, 0x00000323}, -{0x00001E5B, 0x00000072}, {0x00001E5B, 0x00000323}, {0x00001E5C, 0x00000052}, {0x00001E5C, 0x00000323}, -{0x00001E5C, 0x00000304}, {0x00001E5D, 0x00000072}, {0x00001E5D, 0x00000323}, {0x00001E5D, 0x00000304}, -{0x00001E5E, 0x00000052}, {0x00001E5E, 0x00000331}, {0x00001E5F, 0x00000072}, {0x00001E5F, 0x00000331}, -{0x00001E60, 0x00000053}, {0x00001E60, 0x00000307}, {0x00001E61, 0x00000073}, {0x00001E61, 0x00000307}, -{0x00001E62, 0x00000053}, {0x00001E62, 0x00000323}, {0x00001E63, 0x00000073}, {0x00001E63, 0x00000323}, -{0x00001E64, 0x00000053}, {0x00001E64, 0x00000301}, {0x00001E64, 0x00000307}, {0x00001E65, 0x00000073}, -{0x00001E65, 0x00000301}, {0x00001E65, 0x00000307}, {0x00001E66, 0x00000053}, {0x00001E66, 0x0000030C}, -{0x00001E66, 0x00000307}, {0x00001E67, 0x00000073}, {0x00001E67, 0x0000030C}, {0x00001E67, 0x00000307}, -{0x00001E68, 0x00000053}, {0x00001E68, 0x00000323}, {0x00001E68, 0x00000307}, {0x00001E69, 0x00000073}, -{0x00001E69, 0x00000323}, {0x00001E69, 0x00000307}, {0x00001E6A, 0x00000054}, {0x00001E6A, 0x00000307}, -{0x00001E6B, 0x00000074}, {0x00001E6B, 0x00000307}, {0x00001E6C, 0x00000054}, {0x00001E6C, 0x00000323}, -{0x00001E6D, 0x00000074}, {0x00001E6D, 0x00000323}, {0x00001E6E, 0x00000054}, {0x00001E6E, 0x00000331}, -{0x00001E6F, 0x00000074}, {0x00001E6F, 0x00000331}, {0x00001E70, 0x00000054}, {0x00001E70, 0x0000032D}, -{0x00001E71, 0x00000074}, {0x00001E71, 0x0000032D}, {0x00001E72, 0x00000055}, {0x00001E72, 0x00000324}, -{0x00001E73, 0x00000075}, {0x00001E73, 0x00000324}, {0x00001E74, 0x00000055}, {0x00001E74, 0x00000330}, -{0x00001E75, 0x00000075}, {0x00001E75, 0x00000330}, {0x00001E76, 0x00000055}, {0x00001E76, 0x0000032D}, -{0x00001E77, 0x00000075}, {0x00001E77, 0x0000032D}, {0x00001E78, 0x00000055}, {0x00001E78, 0x00000303}, -{0x00001E78, 0x00000301}, {0x00001E79, 0x00000075}, {0x00001E79, 0x00000303}, {0x00001E79, 0x00000301}, -{0x00001E7A, 0x00000055}, {0x00001E7A, 0x00000304}, {0x00001E7A, 0x00000308}, {0x00001E7B, 0x00000075}, -{0x00001E7B, 0x00000304}, {0x00001E7B, 0x00000308}, {0x00001E7C, 0x00000056}, {0x00001E7C, 0x00000303}, -{0x00001E7D, 0x00000076}, {0x00001E7D, 0x00000303}, {0x00001E7E, 0x00000056}, {0x00001E7E, 0x00000323}, -{0x00001E7F, 0x00000076}, {0x00001E7F, 0x00000323}, {0x00001E80, 0x00000057}, {0x00001E80, 0x00000300}, -{0x00001E81, 0x00000077}, {0x00001E81, 0x00000300}, {0x00001E82, 0x00000057}, {0x00001E82, 0x00000301}, -{0x00001E83, 0x00000077}, {0x00001E83, 0x00000301}, {0x00001E84, 0x00000057}, {0x00001E84, 0x00000308}, -{0x00001E85, 0x00000077}, {0x00001E85, 0x00000308}, {0x00001E86, 0x00000057}, {0x00001E86, 0x00000307}, -{0x00001E87, 0x00000077}, {0x00001E87, 0x00000307}, {0x00001E88, 0x00000057}, {0x00001E88, 0x00000323}, -{0x00001E89, 0x00000077}, {0x00001E89, 0x00000323}, {0x00001E8A, 0x00000058}, {0x00001E8A, 0x00000307}, -{0x00001E8B, 0x00000078}, {0x00001E8B, 0x00000307}, {0x00001E8C, 0x00000058}, {0x00001E8C, 0x00000308}, -{0x00001E8D, 0x00000078}, {0x00001E8D, 0x00000308}, {0x00001E8E, 0x00000059}, {0x00001E8E, 0x00000307}, -{0x00001E8F, 0x00000079}, {0x00001E8F, 0x00000307}, {0x00001E90, 0x0000005A}, {0x00001E90, 0x00000302}, -{0x00001E91, 0x0000007A}, {0x00001E91, 0x00000302}, {0x00001E92, 0x0000005A}, {0x00001E92, 0x00000323}, -{0x00001E93, 0x0000007A}, {0x00001E93, 0x00000323}, {0x00001E94, 0x0000005A}, {0x00001E94, 0x00000331}, -{0x00001E95, 0x0000007A}, {0x00001E95, 0x00000331}, {0x00001E96, 0x00000068}, {0x00001E96, 0x00000331}, -{0x00001E97, 0x00000074}, {0x00001E97, 0x00000308}, {0x00001E98, 0x00000077}, {0x00001E98, 0x0000030A}, -{0x00001E99, 0x00000079}, {0x00001E99, 0x0000030A}, {0x00001E9B, 0x0000017F}, {0x00001E9B, 0x00000307}, -{0x00001EA0, 0x00000041}, {0x00001EA0, 0x00000323}, {0x00001EA1, 0x00000061}, {0x00001EA1, 0x00000323}, -{0x00001EA2, 0x00000041}, {0x00001EA2, 0x00000309}, {0x00001EA3, 0x00000061}, {0x00001EA3, 0x00000309}, -{0x00001EA4, 0x00000041}, {0x00001EA4, 0x00000302}, {0x00001EA4, 0x00000301}, {0x00001EA5, 0x00000061}, -{0x00001EA5, 0x00000302}, {0x00001EA5, 0x00000301}, {0x00001EA6, 0x00000041}, {0x00001EA6, 0x00000302}, -{0x00001EA6, 0x00000300}, {0x00001EA7, 0x00000061}, {0x00001EA7, 0x00000302}, {0x00001EA7, 0x00000300}, -{0x00001EA8, 0x00000041}, {0x00001EA8, 0x00000302}, {0x00001EA8, 0x00000309}, {0x00001EA9, 0x00000061}, -{0x00001EA9, 0x00000302}, {0x00001EA9, 0x00000309}, {0x00001EAA, 0x00000041}, {0x00001EAA, 0x00000302}, -{0x00001EAA, 0x00000303}, {0x00001EAB, 0x00000061}, {0x00001EAB, 0x00000302}, {0x00001EAB, 0x00000303}, -{0x00001EAC, 0x00000041}, {0x00001EAC, 0x00000323}, {0x00001EAC, 0x00000302}, {0x00001EAD, 0x00000061}, -{0x00001EAD, 0x00000323}, {0x00001EAD, 0x00000302}, {0x00001EAE, 0x00000041}, {0x00001EAE, 0x00000306}, -{0x00001EAE, 0x00000301}, {0x00001EAF, 0x00000061}, {0x00001EAF, 0x00000306}, {0x00001EAF, 0x00000301}, -{0x00001EB0, 0x00000041}, {0x00001EB0, 0x00000306}, {0x00001EB0, 0x00000300}, {0x00001EB1, 0x00000061}, -{0x00001EB1, 0x00000306}, {0x00001EB1, 0x00000300}, {0x00001EB2, 0x00000041}, {0x00001EB2, 0x00000306}, -{0x00001EB2, 0x00000309}, {0x00001EB3, 0x00000061}, {0x00001EB3, 0x00000306}, {0x00001EB3, 0x00000309}, -{0x00001EB4, 0x00000041}, {0x00001EB4, 0x00000306}, {0x00001EB4, 0x00000303}, {0x00001EB5, 0x00000061}, -{0x00001EB5, 0x00000306}, {0x00001EB5, 0x00000303}, {0x00001EB6, 0x00000041}, {0x00001EB6, 0x00000323}, -{0x00001EB6, 0x00000306}, {0x00001EB7, 0x00000061}, {0x00001EB7, 0x00000323}, {0x00001EB7, 0x00000306}, -{0x00001EB8, 0x00000045}, {0x00001EB8, 0x00000323}, {0x00001EB9, 0x00000065}, {0x00001EB9, 0x00000323}, -{0x00001EBA, 0x00000045}, {0x00001EBA, 0x00000309}, {0x00001EBB, 0x00000065}, {0x00001EBB, 0x00000309}, -{0x00001EBC, 0x00000045}, {0x00001EBC, 0x00000303}, {0x00001EBD, 0x00000065}, {0x00001EBD, 0x00000303}, -{0x00001EBE, 0x00000045}, {0x00001EBE, 0x00000302}, {0x00001EBE, 0x00000301}, {0x00001EBF, 0x00000065}, -{0x00001EBF, 0x00000302}, {0x00001EBF, 0x00000301}, {0x00001EC0, 0x00000045}, {0x00001EC0, 0x00000302}, -{0x00001EC0, 0x00000300}, {0x00001EC1, 0x00000065}, {0x00001EC1, 0x00000302}, {0x00001EC1, 0x00000300}, -{0x00001EC2, 0x00000045}, {0x00001EC2, 0x00000302}, {0x00001EC2, 0x00000309}, {0x00001EC3, 0x00000065}, -{0x00001EC3, 0x00000302}, {0x00001EC3, 0x00000309}, {0x00001EC4, 0x00000045}, {0x00001EC4, 0x00000302}, -{0x00001EC4, 0x00000303}, {0x00001EC5, 0x00000065}, {0x00001EC5, 0x00000302}, {0x00001EC5, 0x00000303}, -{0x00001EC6, 0x00000045}, {0x00001EC6, 0x00000323}, {0x00001EC6, 0x00000302}, {0x00001EC7, 0x00000065}, -{0x00001EC7, 0x00000323}, {0x00001EC7, 0x00000302}, {0x00001EC8, 0x00000049}, {0x00001EC8, 0x00000309}, -{0x00001EC9, 0x00000069}, {0x00001EC9, 0x00000309}, {0x00001ECA, 0x00000049}, {0x00001ECA, 0x00000323}, -{0x00001ECB, 0x00000069}, {0x00001ECB, 0x00000323}, {0x00001ECC, 0x0000004F}, {0x00001ECC, 0x00000323}, -{0x00001ECD, 0x0000006F}, {0x00001ECD, 0x00000323}, {0x00001ECE, 0x0000004F}, {0x00001ECE, 0x00000309}, -{0x00001ECF, 0x0000006F}, {0x00001ECF, 0x00000309}, {0x00001ED0, 0x0000004F}, {0x00001ED0, 0x00000302}, -{0x00001ED0, 0x00000301}, {0x00001ED1, 0x0000006F}, {0x00001ED1, 0x00000302}, {0x00001ED1, 0x00000301}, -{0x00001ED2, 0x0000004F}, {0x00001ED2, 0x00000302}, {0x00001ED2, 0x00000300}, {0x00001ED3, 0x0000006F}, -{0x00001ED3, 0x00000302}, {0x00001ED3, 0x00000300}, {0x00001ED4, 0x0000004F}, {0x00001ED4, 0x00000302}, -{0x00001ED4, 0x00000309}, {0x00001ED5, 0x0000006F}, {0x00001ED5, 0x00000302}, {0x00001ED5, 0x00000309}, -{0x00001ED6, 0x0000004F}, {0x00001ED6, 0x00000302}, {0x00001ED6, 0x00000303}, {0x00001ED7, 0x0000006F}, -{0x00001ED7, 0x00000302}, {0x00001ED7, 0x00000303}, {0x00001ED8, 0x0000004F}, {0x00001ED8, 0x00000323}, -{0x00001ED8, 0x00000302}, {0x00001ED9, 0x0000006F}, {0x00001ED9, 0x00000323}, {0x00001ED9, 0x00000302}, -{0x00001EDA, 0x0000004F}, {0x00001EDA, 0x0000031B}, {0x00001EDA, 0x00000301}, {0x00001EDB, 0x0000006F}, -{0x00001EDB, 0x0000031B}, {0x00001EDB, 0x00000301}, {0x00001EDC, 0x0000004F}, {0x00001EDC, 0x0000031B}, -{0x00001EDC, 0x00000300}, {0x00001EDD, 0x0000006F}, {0x00001EDD, 0x0000031B}, {0x00001EDD, 0x00000300}, -{0x00001EDE, 0x0000004F}, {0x00001EDE, 0x0000031B}, {0x00001EDE, 0x00000309}, {0x00001EDF, 0x0000006F}, -{0x00001EDF, 0x0000031B}, {0x00001EDF, 0x00000309}, {0x00001EE0, 0x0000004F}, {0x00001EE0, 0x0000031B}, -{0x00001EE0, 0x00000303}, {0x00001EE1, 0x0000006F}, {0x00001EE1, 0x0000031B}, {0x00001EE1, 0x00000303}, -{0x00001EE2, 0x0000004F}, {0x00001EE2, 0x0000031B}, {0x00001EE2, 0x00000323}, {0x00001EE3, 0x0000006F}, -{0x00001EE3, 0x0000031B}, {0x00001EE3, 0x00000323}, {0x00001EE4, 0x00000055}, {0x00001EE4, 0x00000323}, -{0x00001EE5, 0x00000075}, {0x00001EE5, 0x00000323}, {0x00001EE6, 0x00000055}, {0x00001EE6, 0x00000309}, -{0x00001EE7, 0x00000075}, {0x00001EE7, 0x00000309}, {0x00001EE8, 0x00000055}, {0x00001EE8, 0x0000031B}, -{0x00001EE8, 0x00000301}, {0x00001EE9, 0x00000075}, {0x00001EE9, 0x0000031B}, {0x00001EE9, 0x00000301}, -{0x00001EEA, 0x00000055}, {0x00001EEA, 0x0000031B}, {0x00001EEA, 0x00000300}, {0x00001EEB, 0x00000075}, -{0x00001EEB, 0x0000031B}, {0x00001EEB, 0x00000300}, {0x00001EEC, 0x00000055}, {0x00001EEC, 0x0000031B}, -{0x00001EEC, 0x00000309}, {0x00001EED, 0x00000075}, {0x00001EED, 0x0000031B}, {0x00001EED, 0x00000309}, -{0x00001EEE, 0x00000055}, {0x00001EEE, 0x0000031B}, {0x00001EEE, 0x00000303}, {0x00001EEF, 0x00000075}, -{0x00001EEF, 0x0000031B}, {0x00001EEF, 0x00000303}, {0x00001EF0, 0x00000055}, {0x00001EF0, 0x0000031B}, -{0x00001EF0, 0x00000323}, {0x00001EF1, 0x00000075}, {0x00001EF1, 0x0000031B}, {0x00001EF1, 0x00000323}, -{0x00001EF2, 0x00000059}, {0x00001EF2, 0x00000300}, {0x00001EF3, 0x00000079}, {0x00001EF3, 0x00000300}, -{0x00001EF4, 0x00000059}, {0x00001EF4, 0x00000323}, {0x00001EF5, 0x00000079}, {0x00001EF5, 0x00000323}, -{0x00001EF6, 0x00000059}, {0x00001EF6, 0x00000309}, {0x00001EF7, 0x00000079}, {0x00001EF7, 0x00000309}, -{0x00001EF8, 0x00000059}, {0x00001EF8, 0x00000303}, {0x00001EF9, 0x00000079}, {0x00001EF9, 0x00000303}, -{0x00001F00, 0x000003B1}, {0x00001F00, 0x00000313}, {0x00001F01, 0x000003B1}, {0x00001F01, 0x00000314}, -{0x00001F02, 0x000003B1}, {0x00001F02, 0x00000313}, {0x00001F02, 0x00000300}, {0x00001F03, 0x000003B1}, -{0x00001F03, 0x00000314}, {0x00001F03, 0x00000300}, {0x00001F04, 0x000003B1}, {0x00001F04, 0x00000313}, -{0x00001F04, 0x00000301}, {0x00001F05, 0x000003B1}, {0x00001F05, 0x00000314}, {0x00001F05, 0x00000301}, -{0x00001F06, 0x000003B1}, {0x00001F06, 0x00000313}, {0x00001F06, 0x00000342}, {0x00001F07, 0x000003B1}, -{0x00001F07, 0x00000314}, {0x00001F07, 0x00000342}, {0x00001F08, 0x00000391}, {0x00001F08, 0x00000313}, -{0x00001F09, 0x00000391}, {0x00001F09, 0x00000314}, {0x00001F0A, 0x00000391}, {0x00001F0A, 0x00000313}, -{0x00001F0A, 0x00000300}, {0x00001F0B, 0x00000391}, {0x00001F0B, 0x00000314}, {0x00001F0B, 0x00000300}, -{0x00001F0C, 0x00000391}, {0x00001F0C, 0x00000313}, {0x00001F0C, 0x00000301}, {0x00001F0D, 0x00000391}, -{0x00001F0D, 0x00000314}, {0x00001F0D, 0x00000301}, {0x00001F0E, 0x00000391}, {0x00001F0E, 0x00000313}, -{0x00001F0E, 0x00000342}, {0x00001F0F, 0x00000391}, {0x00001F0F, 0x00000314}, {0x00001F0F, 0x00000342}, -{0x00001F10, 0x000003B5}, {0x00001F10, 0x00000313}, {0x00001F11, 0x000003B5}, {0x00001F11, 0x00000314}, -{0x00001F12, 0x000003B5}, {0x00001F12, 0x00000313}, {0x00001F12, 0x00000300}, {0x00001F13, 0x000003B5}, -{0x00001F13, 0x00000314}, {0x00001F13, 0x00000300}, {0x00001F14, 0x000003B5}, {0x00001F14, 0x00000313}, -{0x00001F14, 0x00000301}, {0x00001F15, 0x000003B5}, {0x00001F15, 0x00000314}, {0x00001F15, 0x00000301}, -{0x00001F18, 0x00000395}, {0x00001F18, 0x00000313}, {0x00001F19, 0x00000395}, {0x00001F19, 0x00000314}, -{0x00001F1A, 0x00000395}, {0x00001F1A, 0x00000313}, {0x00001F1A, 0x00000300}, {0x00001F1B, 0x00000395}, -{0x00001F1B, 0x00000314}, {0x00001F1B, 0x00000300}, {0x00001F1C, 0x00000395}, {0x00001F1C, 0x00000313}, -{0x00001F1C, 0x00000301}, {0x00001F1D, 0x00000395}, {0x00001F1D, 0x00000314}, {0x00001F1D, 0x00000301}, -{0x00001F20, 0x000003B7}, {0x00001F20, 0x00000313}, {0x00001F21, 0x000003B7}, {0x00001F21, 0x00000314}, -{0x00001F22, 0x000003B7}, {0x00001F22, 0x00000313}, {0x00001F22, 0x00000300}, {0x00001F23, 0x000003B7}, -{0x00001F23, 0x00000314}, {0x00001F23, 0x00000300}, {0x00001F24, 0x000003B7}, {0x00001F24, 0x00000313}, -{0x00001F24, 0x00000301}, {0x00001F25, 0x000003B7}, {0x00001F25, 0x00000314}, {0x00001F25, 0x00000301}, -{0x00001F26, 0x000003B7}, {0x00001F26, 0x00000313}, {0x00001F26, 0x00000342}, {0x00001F27, 0x000003B7}, -{0x00001F27, 0x00000314}, {0x00001F27, 0x00000342}, {0x00001F28, 0x00000397}, {0x00001F28, 0x00000313}, -{0x00001F29, 0x00000397}, {0x00001F29, 0x00000314}, {0x00001F2A, 0x00000397}, {0x00001F2A, 0x00000313}, -{0x00001F2A, 0x00000300}, {0x00001F2B, 0x00000397}, {0x00001F2B, 0x00000314}, {0x00001F2B, 0x00000300}, -{0x00001F2C, 0x00000397}, {0x00001F2C, 0x00000313}, {0x00001F2C, 0x00000301}, {0x00001F2D, 0x00000397}, -{0x00001F2D, 0x00000314}, {0x00001F2D, 0x00000301}, {0x00001F2E, 0x00000397}, {0x00001F2E, 0x00000313}, -{0x00001F2E, 0x00000342}, {0x00001F2F, 0x00000397}, {0x00001F2F, 0x00000314}, {0x00001F2F, 0x00000342}, -{0x00001F30, 0x000003B9}, {0x00001F30, 0x00000313}, {0x00001F31, 0x000003B9}, {0x00001F31, 0x00000314}, -{0x00001F32, 0x000003B9}, {0x00001F32, 0x00000313}, {0x00001F32, 0x00000300}, {0x00001F33, 0x000003B9}, -{0x00001F33, 0x00000314}, {0x00001F33, 0x00000300}, {0x00001F34, 0x000003B9}, {0x00001F34, 0x00000313}, -{0x00001F34, 0x00000301}, {0x00001F35, 0x000003B9}, {0x00001F35, 0x00000314}, {0x00001F35, 0x00000301}, -{0x00001F36, 0x000003B9}, {0x00001F36, 0x00000313}, {0x00001F36, 0x00000342}, {0x00001F37, 0x000003B9}, -{0x00001F37, 0x00000314}, {0x00001F37, 0x00000342}, {0x00001F38, 0x00000399}, {0x00001F38, 0x00000313}, -{0x00001F39, 0x00000399}, {0x00001F39, 0x00000314}, {0x00001F3A, 0x00000399}, {0x00001F3A, 0x00000313}, -{0x00001F3A, 0x00000300}, {0x00001F3B, 0x00000399}, {0x00001F3B, 0x00000314}, {0x00001F3B, 0x00000300}, -{0x00001F3C, 0x00000399}, {0x00001F3C, 0x00000313}, {0x00001F3C, 0x00000301}, {0x00001F3D, 0x00000399}, -{0x00001F3D, 0x00000314}, {0x00001F3D, 0x00000301}, {0x00001F3E, 0x00000399}, {0x00001F3E, 0x00000313}, -{0x00001F3E, 0x00000342}, {0x00001F3F, 0x00000399}, {0x00001F3F, 0x00000314}, {0x00001F3F, 0x00000342}, -{0x00001F40, 0x000003BF}, {0x00001F40, 0x00000313}, {0x00001F41, 0x000003BF}, {0x00001F41, 0x00000314}, -{0x00001F42, 0x000003BF}, {0x00001F42, 0x00000313}, {0x00001F42, 0x00000300}, {0x00001F43, 0x000003BF}, -{0x00001F43, 0x00000314}, {0x00001F43, 0x00000300}, {0x00001F44, 0x000003BF}, {0x00001F44, 0x00000313}, -{0x00001F44, 0x00000301}, {0x00001F45, 0x000003BF}, {0x00001F45, 0x00000314}, {0x00001F45, 0x00000301}, -{0x00001F48, 0x0000039F}, {0x00001F48, 0x00000313}, {0x00001F49, 0x0000039F}, {0x00001F49, 0x00000314}, -{0x00001F4A, 0x0000039F}, {0x00001F4A, 0x00000313}, {0x00001F4A, 0x00000300}, {0x00001F4B, 0x0000039F}, -{0x00001F4B, 0x00000314}, {0x00001F4B, 0x00000300}, {0x00001F4C, 0x0000039F}, {0x00001F4C, 0x00000313}, -{0x00001F4C, 0x00000301}, {0x00001F4D, 0x0000039F}, {0x00001F4D, 0x00000314}, {0x00001F4D, 0x00000301}, -{0x00001F50, 0x000003C5}, {0x00001F50, 0x00000313}, {0x00001F51, 0x000003C5}, {0x00001F51, 0x00000314}, -{0x00001F52, 0x000003C5}, {0x00001F52, 0x00000313}, {0x00001F52, 0x00000300}, {0x00001F53, 0x000003C5}, -{0x00001F53, 0x00000314}, {0x00001F53, 0x00000300}, {0x00001F54, 0x000003C5}, {0x00001F54, 0x00000313}, -{0x00001F54, 0x00000301}, {0x00001F55, 0x000003C5}, {0x00001F55, 0x00000314}, {0x00001F55, 0x00000301}, -{0x00001F56, 0x000003C5}, {0x00001F56, 0x00000313}, {0x00001F56, 0x00000342}, {0x00001F57, 0x000003C5}, -{0x00001F57, 0x00000314}, {0x00001F57, 0x00000342}, {0x00001F59, 0x000003A5}, {0x00001F59, 0x00000314}, -{0x00001F5B, 0x000003A5}, {0x00001F5B, 0x00000314}, {0x00001F5B, 0x00000300}, {0x00001F5D, 0x000003A5}, -{0x00001F5D, 0x00000314}, {0x00001F5D, 0x00000301}, {0x00001F5F, 0x000003A5}, {0x00001F5F, 0x00000314}, -{0x00001F5F, 0x00000342}, {0x00001F60, 0x000003C9}, {0x00001F60, 0x00000313}, {0x00001F61, 0x000003C9}, -{0x00001F61, 0x00000314}, {0x00001F62, 0x000003C9}, {0x00001F62, 0x00000313}, {0x00001F62, 0x00000300}, -{0x00001F63, 0x000003C9}, {0x00001F63, 0x00000314}, {0x00001F63, 0x00000300}, {0x00001F64, 0x000003C9}, -{0x00001F64, 0x00000313}, {0x00001F64, 0x00000301}, {0x00001F65, 0x000003C9}, {0x00001F65, 0x00000314}, -{0x00001F65, 0x00000301}, {0x00001F66, 0x000003C9}, {0x00001F66, 0x00000313}, {0x00001F66, 0x00000342}, -{0x00001F67, 0x000003C9}, {0x00001F67, 0x00000314}, {0x00001F67, 0x00000342}, {0x00001F68, 0x000003A9}, -{0x00001F68, 0x00000313}, {0x00001F69, 0x000003A9}, {0x00001F69, 0x00000314}, {0x00001F6A, 0x000003A9}, -{0x00001F6A, 0x00000313}, {0x00001F6A, 0x00000300}, {0x00001F6B, 0x000003A9}, {0x00001F6B, 0x00000314}, -{0x00001F6B, 0x00000300}, {0x00001F6C, 0x000003A9}, {0x00001F6C, 0x00000313}, {0x00001F6C, 0x00000301}, -{0x00001F6D, 0x000003A9}, {0x00001F6D, 0x00000314}, {0x00001F6D, 0x00000301}, {0x00001F6E, 0x000003A9}, -{0x00001F6E, 0x00000313}, {0x00001F6E, 0x00000342}, {0x00001F6F, 0x000003A9}, {0x00001F6F, 0x00000314}, -{0x00001F6F, 0x00000342}, {0x00001F70, 0x000003B1}, {0x00001F70, 0x00000300}, {0x00001F71, 0x000003B1}, -{0x00001F71, 0x00000301}, {0x00001F72, 0x000003B5}, {0x00001F72, 0x00000300}, {0x00001F73, 0x000003B5}, -{0x00001F73, 0x00000301}, {0x00001F74, 0x000003B7}, {0x00001F74, 0x00000300}, {0x00001F75, 0x000003B7}, -{0x00001F75, 0x00000301}, {0x00001F76, 0x000003B9}, {0x00001F76, 0x00000300}, {0x00001F77, 0x000003B9}, -{0x00001F77, 0x00000301}, {0x00001F78, 0x000003BF}, {0x00001F78, 0x00000300}, {0x00001F79, 0x000003BF}, -{0x00001F79, 0x00000301}, {0x00001F7A, 0x000003C5}, {0x00001F7A, 0x00000300}, {0x00001F7B, 0x000003C5}, -{0x00001F7B, 0x00000301}, {0x00001F7C, 0x000003C9}, {0x00001F7C, 0x00000300}, {0x00001F7D, 0x000003C9}, -{0x00001F7D, 0x00000301}, {0x00001F80, 0x000003B1}, {0x00001F80, 0x00000313}, {0x00001F80, 0x00000345}, -{0x00001F81, 0x000003B1}, {0x00001F81, 0x00000314}, {0x00001F81, 0x00000345}, {0x00001F82, 0x000003B1}, -{0x00001F82, 0x00000313}, {0x00001F82, 0x00000300}, {0x00001F82, 0x00000345}, {0x00001F83, 0x000003B1}, -{0x00001F83, 0x00000314}, {0x00001F83, 0x00000300}, {0x00001F83, 0x00000345}, {0x00001F84, 0x000003B1}, -{0x00001F84, 0x00000313}, {0x00001F84, 0x00000301}, {0x00001F84, 0x00000345}, {0x00001F85, 0x000003B1}, -{0x00001F85, 0x00000314}, {0x00001F85, 0x00000301}, {0x00001F85, 0x00000345}, {0x00001F86, 0x000003B1}, -{0x00001F86, 0x00000313}, {0x00001F86, 0x00000342}, {0x00001F86, 0x00000345}, {0x00001F87, 0x000003B1}, -{0x00001F87, 0x00000314}, {0x00001F87, 0x00000342}, {0x00001F87, 0x00000345}, {0x00001F88, 0x00000391}, -{0x00001F88, 0x00000313}, {0x00001F88, 0x00000345}, {0x00001F89, 0x00000391}, {0x00001F89, 0x00000314}, -{0x00001F89, 0x00000345}, {0x00001F8A, 0x00000391}, {0x00001F8A, 0x00000313}, {0x00001F8A, 0x00000300}, -{0x00001F8A, 0x00000345}, {0x00001F8B, 0x00000391}, {0x00001F8B, 0x00000314}, {0x00001F8B, 0x00000300}, -{0x00001F8B, 0x00000345}, {0x00001F8C, 0x00000391}, {0x00001F8C, 0x00000313}, {0x00001F8C, 0x00000301}, -{0x00001F8C, 0x00000345}, {0x00001F8D, 0x00000391}, {0x00001F8D, 0x00000314}, {0x00001F8D, 0x00000301}, -{0x00001F8D, 0x00000345}, {0x00001F8E, 0x00000391}, {0x00001F8E, 0x00000313}, {0x00001F8E, 0x00000342}, -{0x00001F8E, 0x00000345}, {0x00001F8F, 0x00000391}, {0x00001F8F, 0x00000314}, {0x00001F8F, 0x00000342}, -{0x00001F8F, 0x00000345}, {0x00001F90, 0x000003B7}, {0x00001F90, 0x00000313}, {0x00001F90, 0x00000345}, -{0x00001F91, 0x000003B7}, {0x00001F91, 0x00000314}, {0x00001F91, 0x00000345}, {0x00001F92, 0x000003B7}, -{0x00001F92, 0x00000313}, {0x00001F92, 0x00000300}, {0x00001F92, 0x00000345}, {0x00001F93, 0x000003B7}, -{0x00001F93, 0x00000314}, {0x00001F93, 0x00000300}, {0x00001F93, 0x00000345}, {0x00001F94, 0x000003B7}, -{0x00001F94, 0x00000313}, {0x00001F94, 0x00000301}, {0x00001F94, 0x00000345}, {0x00001F95, 0x000003B7}, -{0x00001F95, 0x00000314}, {0x00001F95, 0x00000301}, {0x00001F95, 0x00000345}, {0x00001F96, 0x000003B7}, -{0x00001F96, 0x00000313}, {0x00001F96, 0x00000342}, {0x00001F96, 0x00000345}, {0x00001F97, 0x000003B7}, -{0x00001F97, 0x00000314}, {0x00001F97, 0x00000342}, {0x00001F97, 0x00000345}, {0x00001F98, 0x00000397}, -{0x00001F98, 0x00000313}, {0x00001F98, 0x00000345}, {0x00001F99, 0x00000397}, {0x00001F99, 0x00000314}, -{0x00001F99, 0x00000345}, {0x00001F9A, 0x00000397}, {0x00001F9A, 0x00000313}, {0x00001F9A, 0x00000300}, -{0x00001F9A, 0x00000345}, {0x00001F9B, 0x00000397}, {0x00001F9B, 0x00000314}, {0x00001F9B, 0x00000300}, -{0x00001F9B, 0x00000345}, {0x00001F9C, 0x00000397}, {0x00001F9C, 0x00000313}, {0x00001F9C, 0x00000301}, -{0x00001F9C, 0x00000345}, {0x00001F9D, 0x00000397}, {0x00001F9D, 0x00000314}, {0x00001F9D, 0x00000301}, -{0x00001F9D, 0x00000345}, {0x00001F9E, 0x00000397}, {0x00001F9E, 0x00000313}, {0x00001F9E, 0x00000342}, -{0x00001F9E, 0x00000345}, {0x00001F9F, 0x00000397}, {0x00001F9F, 0x00000314}, {0x00001F9F, 0x00000342}, -{0x00001F9F, 0x00000345}, {0x00001FA0, 0x000003C9}, {0x00001FA0, 0x00000313}, {0x00001FA0, 0x00000345}, -{0x00001FA1, 0x000003C9}, {0x00001FA1, 0x00000314}, {0x00001FA1, 0x00000345}, {0x00001FA2, 0x000003C9}, -{0x00001FA2, 0x00000313}, {0x00001FA2, 0x00000300}, {0x00001FA2, 0x00000345}, {0x00001FA3, 0x000003C9}, -{0x00001FA3, 0x00000314}, {0x00001FA3, 0x00000300}, {0x00001FA3, 0x00000345}, {0x00001FA4, 0x000003C9}, -{0x00001FA4, 0x00000313}, {0x00001FA4, 0x00000301}, {0x00001FA4, 0x00000345}, {0x00001FA5, 0x000003C9}, -{0x00001FA5, 0x00000314}, {0x00001FA5, 0x00000301}, {0x00001FA5, 0x00000345}, {0x00001FA6, 0x000003C9}, -{0x00001FA6, 0x00000313}, {0x00001FA6, 0x00000342}, {0x00001FA6, 0x00000345}, {0x00001FA7, 0x000003C9}, -{0x00001FA7, 0x00000314}, {0x00001FA7, 0x00000342}, {0x00001FA7, 0x00000345}, {0x00001FA8, 0x000003A9}, -{0x00001FA8, 0x00000313}, {0x00001FA8, 0x00000345}, {0x00001FA9, 0x000003A9}, {0x00001FA9, 0x00000314}, -{0x00001FA9, 0x00000345}, {0x00001FAA, 0x000003A9}, {0x00001FAA, 0x00000313}, {0x00001FAA, 0x00000300}, -{0x00001FAA, 0x00000345}, {0x00001FAB, 0x000003A9}, {0x00001FAB, 0x00000314}, {0x00001FAB, 0x00000300}, -{0x00001FAB, 0x00000345}, {0x00001FAC, 0x000003A9}, {0x00001FAC, 0x00000313}, {0x00001FAC, 0x00000301}, -{0x00001FAC, 0x00000345}, {0x00001FAD, 0x000003A9}, {0x00001FAD, 0x00000314}, {0x00001FAD, 0x00000301}, -{0x00001FAD, 0x00000345}, {0x00001FAE, 0x000003A9}, {0x00001FAE, 0x00000313}, {0x00001FAE, 0x00000342}, -{0x00001FAE, 0x00000345}, {0x00001FAF, 0x000003A9}, {0x00001FAF, 0x00000314}, {0x00001FAF, 0x00000342}, -{0x00001FAF, 0x00000345}, {0x00001FB0, 0x000003B1}, {0x00001FB0, 0x00000306}, {0x00001FB1, 0x000003B1}, -{0x00001FB1, 0x00000304}, {0x00001FB2, 0x000003B1}, {0x00001FB2, 0x00000300}, {0x00001FB2, 0x00000345}, -{0x00001FB3, 0x000003B1}, {0x00001FB3, 0x00000345}, {0x00001FB4, 0x000003B1}, {0x00001FB4, 0x00000301}, -{0x00001FB4, 0x00000345}, {0x00001FB6, 0x000003B1}, {0x00001FB6, 0x00000342}, {0x00001FB7, 0x000003B1}, -{0x00001FB7, 0x00000342}, {0x00001FB7, 0x00000345}, {0x00001FB8, 0x00000391}, {0x00001FB8, 0x00000306}, -{0x00001FB9, 0x00000391}, {0x00001FB9, 0x00000304}, {0x00001FBA, 0x00000391}, {0x00001FBA, 0x00000300}, -{0x00001FBB, 0x00000391}, {0x00001FBB, 0x00000301}, {0x00001FBC, 0x00000391}, {0x00001FBC, 0x00000345}, -{0x00001FBE, 0x000003B9}, {0x00001FC1, 0x000000A8}, {0x00001FC1, 0x00000342}, {0x00001FC2, 0x000003B7}, -{0x00001FC2, 0x00000300}, {0x00001FC2, 0x00000345}, {0x00001FC3, 0x000003B7}, {0x00001FC3, 0x00000345}, -{0x00001FC4, 0x000003B7}, {0x00001FC4, 0x00000301}, {0x00001FC4, 0x00000345}, {0x00001FC6, 0x000003B7}, -{0x00001FC6, 0x00000342}, {0x00001FC7, 0x000003B7}, {0x00001FC7, 0x00000342}, {0x00001FC7, 0x00000345}, -{0x00001FC8, 0x00000395}, {0x00001FC8, 0x00000300}, {0x00001FC9, 0x00000395}, {0x00001FC9, 0x00000301}, -{0x00001FCA, 0x00000397}, {0x00001FCA, 0x00000300}, {0x00001FCB, 0x00000397}, {0x00001FCB, 0x00000301}, -{0x00001FCC, 0x00000397}, {0x00001FCC, 0x00000345}, {0x00001FCD, 0x00001FBF}, {0x00001FCD, 0x00000300}, -{0x00001FCE, 0x00001FBF}, {0x00001FCE, 0x00000301}, {0x00001FCF, 0x00001FBF}, {0x00001FCF, 0x00000342}, -{0x00001FD0, 0x000003B9}, {0x00001FD0, 0x00000306}, {0x00001FD1, 0x000003B9}, {0x00001FD1, 0x00000304}, -{0x00001FD2, 0x000003B9}, {0x00001FD2, 0x00000308}, {0x00001FD2, 0x00000300}, {0x00001FD3, 0x000003B9}, -{0x00001FD3, 0x00000308}, {0x00001FD3, 0x00000301}, {0x00001FD6, 0x000003B9}, {0x00001FD6, 0x00000342}, -{0x00001FD7, 0x000003B9}, {0x00001FD7, 0x00000308}, {0x00001FD7, 0x00000342}, {0x00001FD8, 0x00000399}, -{0x00001FD8, 0x00000306}, {0x00001FD9, 0x00000399}, {0x00001FD9, 0x00000304}, {0x00001FDA, 0x00000399}, -{0x00001FDA, 0x00000300}, {0x00001FDB, 0x00000399}, {0x00001FDB, 0x00000301}, {0x00001FDD, 0x00001FFE}, -{0x00001FDD, 0x00000300}, {0x00001FDE, 0x00001FFE}, {0x00001FDE, 0x00000301}, {0x00001FDF, 0x00001FFE}, -{0x00001FDF, 0x00000342}, {0x00001FE0, 0x000003C5}, {0x00001FE0, 0x00000306}, {0x00001FE1, 0x000003C5}, -{0x00001FE1, 0x00000304}, {0x00001FE2, 0x000003C5}, {0x00001FE2, 0x00000308}, {0x00001FE2, 0x00000300}, -{0x00001FE3, 0x000003C5}, {0x00001FE3, 0x00000308}, {0x00001FE3, 0x00000301}, {0x00001FE4, 0x000003C1}, -{0x00001FE4, 0x00000313}, {0x00001FE5, 0x000003C1}, {0x00001FE5, 0x00000314}, {0x00001FE6, 0x000003C5}, -{0x00001FE6, 0x00000342}, {0x00001FE7, 0x000003C5}, {0x00001FE7, 0x00000308}, {0x00001FE7, 0x00000342}, -{0x00001FE8, 0x000003A5}, {0x00001FE8, 0x00000306}, {0x00001FE9, 0x000003A5}, {0x00001FE9, 0x00000304}, -{0x00001FEA, 0x000003A5}, {0x00001FEA, 0x00000300}, {0x00001FEB, 0x000003A5}, {0x00001FEB, 0x00000301}, -{0x00001FEC, 0x000003A1}, {0x00001FEC, 0x00000314}, {0x00001FED, 0x000000A8}, {0x00001FED, 0x00000300}, -{0x00001FEE, 0x000000A8}, {0x00001FEE, 0x00000301}, {0x00001FEF, 0x00000060}, {0x00001FF2, 0x000003C9}, -{0x00001FF2, 0x00000300}, {0x00001FF2, 0x00000345}, {0x00001FF3, 0x000003C9}, {0x00001FF3, 0x00000345}, -{0x00001FF4, 0x000003C9}, {0x00001FF4, 0x00000301}, {0x00001FF4, 0x00000345}, {0x00001FF6, 0x000003C9}, -{0x00001FF6, 0x00000342}, {0x00001FF7, 0x000003C9}, {0x00001FF7, 0x00000342}, {0x00001FF7, 0x00000345}, -{0x00001FF8, 0x0000039F}, {0x00001FF8, 0x00000300}, {0x00001FF9, 0x0000039F}, {0x00001FF9, 0x00000301}, -{0x00001FFA, 0x000003A9}, {0x00001FFA, 0x00000300}, {0x00001FFB, 0x000003A9}, {0x00001FFB, 0x00000301}, -{0x00001FFC, 0x000003A9}, {0x00001FFC, 0x00000345}, {0x00001FFD, 0x000000B4}, {0x00002000, 0x00002002}, -{0x00002001, 0x00002003}, {0x00002126, 0x000003A9}, {0x0000212A, 0x0000004B}, {0x0000212B, 0x00000041}, -{0x0000212B, 0x0000030A}, {0x0000219A, 0x00002190}, {0x0000219A, 0x00000338}, {0x0000219B, 0x00002192}, -{0x0000219B, 0x00000338}, {0x000021AE, 0x00002194}, {0x000021AE, 0x00000338}, {0x000021CD, 0x000021D0}, -{0x000021CD, 0x00000338}, {0x000021CE, 0x000021D4}, {0x000021CE, 0x00000338}, {0x000021CF, 0x000021D2}, -{0x000021CF, 0x00000338}, {0x00002204, 0x00002203}, {0x00002204, 0x00000338}, {0x00002209, 0x00002208}, -{0x00002209, 0x00000338}, {0x0000220C, 0x0000220B}, {0x0000220C, 0x00000338}, {0x00002224, 0x00002223}, -{0x00002224, 0x00000338}, {0x00002226, 0x00002225}, {0x00002226, 0x00000338}, {0x00002241, 0x0000223C}, -{0x00002241, 0x00000338}, {0x00002244, 0x00002243}, {0x00002244, 0x00000338}, {0x00002247, 0x00002245}, -{0x00002247, 0x00000338}, {0x00002249, 0x00002248}, {0x00002249, 0x00000338}, {0x00002260, 0x0000003D}, -{0x00002260, 0x00000338}, {0x00002262, 0x00002261}, {0x00002262, 0x00000338}, {0x0000226D, 0x0000224D}, -{0x0000226D, 0x00000338}, {0x0000226E, 0x0000003C}, {0x0000226E, 0x00000338}, {0x0000226F, 0x0000003E}, -{0x0000226F, 0x00000338}, {0x00002270, 0x00002264}, {0x00002270, 0x00000338}, {0x00002271, 0x00002265}, -{0x00002271, 0x00000338}, {0x00002274, 0x00002272}, {0x00002274, 0x00000338}, {0x00002275, 0x00002273}, -{0x00002275, 0x00000338}, {0x00002278, 0x00002276}, {0x00002278, 0x00000338}, {0x00002279, 0x00002277}, -{0x00002279, 0x00000338}, {0x00002280, 0x0000227A}, {0x00002280, 0x00000338}, {0x00002281, 0x0000227B}, -{0x00002281, 0x00000338}, {0x00002284, 0x00002282}, {0x00002284, 0x00000338}, {0x00002285, 0x00002283}, -{0x00002285, 0x00000338}, {0x00002288, 0x00002286}, {0x00002288, 0x00000338}, {0x00002289, 0x00002287}, -{0x00002289, 0x00000338}, {0x000022AC, 0x000022A2}, {0x000022AC, 0x00000338}, {0x000022AD, 0x000022A8}, -{0x000022AD, 0x00000338}, {0x000022AE, 0x000022A9}, {0x000022AE, 0x00000338}, {0x000022AF, 0x000022AB}, -{0x000022AF, 0x00000338}, {0x000022E0, 0x0000227C}, {0x000022E0, 0x00000338}, {0x000022E1, 0x0000227D}, -{0x000022E1, 0x00000338}, {0x000022E2, 0x00002291}, {0x000022E2, 0x00000338}, {0x000022E3, 0x00002292}, -{0x000022E3, 0x00000338}, {0x000022EA, 0x000022B2}, {0x000022EA, 0x00000338}, {0x000022EB, 0x000022B3}, -{0x000022EB, 0x00000338}, {0x000022EC, 0x000022B4}, {0x000022EC, 0x00000338}, {0x000022ED, 0x000022B5}, -{0x000022ED, 0x00000338}, {0x00002329, 0x00003008}, {0x0000232A, 0x00003009}, {0x00002ADC, 0x00002ADD}, -{0x00002ADC, 0x00000338}, {0x0000304C, 0x0000304B}, {0x0000304C, 0x00003099}, {0x0000304E, 0x0000304D}, -{0x0000304E, 0x00003099}, {0x00003050, 0x0000304F}, {0x00003050, 0x00003099}, {0x00003052, 0x00003051}, -{0x00003052, 0x00003099}, {0x00003054, 0x00003053}, {0x00003054, 0x00003099}, {0x00003056, 0x00003055}, -{0x00003056, 0x00003099}, {0x00003058, 0x00003057}, {0x00003058, 0x00003099}, {0x0000305A, 0x00003059}, -{0x0000305A, 0x00003099}, {0x0000305C, 0x0000305B}, {0x0000305C, 0x00003099}, {0x0000305E, 0x0000305D}, -{0x0000305E, 0x00003099}, {0x00003060, 0x0000305F}, {0x00003060, 0x00003099}, {0x00003062, 0x00003061}, -{0x00003062, 0x00003099}, {0x00003065, 0x00003064}, {0x00003065, 0x00003099}, {0x00003067, 0x00003066}, -{0x00003067, 0x00003099}, {0x00003069, 0x00003068}, {0x00003069, 0x00003099}, {0x00003070, 0x0000306F}, -{0x00003070, 0x00003099}, {0x00003071, 0x0000306F}, {0x00003071, 0x0000309A}, {0x00003073, 0x00003072}, -{0x00003073, 0x00003099}, {0x00003074, 0x00003072}, {0x00003074, 0x0000309A}, {0x00003076, 0x00003075}, -{0x00003076, 0x00003099}, {0x00003077, 0x00003075}, {0x00003077, 0x0000309A}, {0x00003079, 0x00003078}, -{0x00003079, 0x00003099}, {0x0000307A, 0x00003078}, {0x0000307A, 0x0000309A}, {0x0000307C, 0x0000307B}, -{0x0000307C, 0x00003099}, {0x0000307D, 0x0000307B}, {0x0000307D, 0x0000309A}, {0x00003094, 0x00003046}, -{0x00003094, 0x00003099}, {0x0000309E, 0x0000309D}, {0x0000309E, 0x00003099}, {0x000030AC, 0x000030AB}, -{0x000030AC, 0x00003099}, {0x000030AE, 0x000030AD}, {0x000030AE, 0x00003099}, {0x000030B0, 0x000030AF}, -{0x000030B0, 0x00003099}, {0x000030B2, 0x000030B1}, {0x000030B2, 0x00003099}, {0x000030B4, 0x000030B3}, -{0x000030B4, 0x00003099}, {0x000030B6, 0x000030B5}, {0x000030B6, 0x00003099}, {0x000030B8, 0x000030B7}, -{0x000030B8, 0x00003099}, {0x000030BA, 0x000030B9}, {0x000030BA, 0x00003099}, {0x000030BC, 0x000030BB}, -{0x000030BC, 0x00003099}, {0x000030BE, 0x000030BD}, {0x000030BE, 0x00003099}, {0x000030C0, 0x000030BF}, -{0x000030C0, 0x00003099}, {0x000030C2, 0x000030C1}, {0x000030C2, 0x00003099}, {0x000030C5, 0x000030C4}, -{0x000030C5, 0x00003099}, {0x000030C7, 0x000030C6}, {0x000030C7, 0x00003099}, {0x000030C9, 0x000030C8}, -{0x000030C9, 0x00003099}, {0x000030D0, 0x000030CF}, {0x000030D0, 0x00003099}, {0x000030D1, 0x000030CF}, -{0x000030D1, 0x0000309A}, {0x000030D3, 0x000030D2}, {0x000030D3, 0x00003099}, {0x000030D4, 0x000030D2}, -{0x000030D4, 0x0000309A}, {0x000030D6, 0x000030D5}, {0x000030D6, 0x00003099}, {0x000030D7, 0x000030D5}, -{0x000030D7, 0x0000309A}, {0x000030D9, 0x000030D8}, {0x000030D9, 0x00003099}, {0x000030DA, 0x000030D8}, -{0x000030DA, 0x0000309A}, {0x000030DC, 0x000030DB}, {0x000030DC, 0x00003099}, {0x000030DD, 0x000030DB}, -{0x000030DD, 0x0000309A}, {0x000030F4, 0x000030A6}, {0x000030F4, 0x00003099}, {0x000030F7, 0x000030EF}, -{0x000030F7, 0x00003099}, {0x000030F8, 0x000030F0}, {0x000030F8, 0x00003099}, {0x000030F9, 0x000030F1}, -{0x000030F9, 0x00003099}, {0x000030FA, 0x000030F2}, {0x000030FA, 0x00003099}, {0x000030FE, 0x000030FD}, -{0x000030FE, 0x00003099}, {0x0000F900, 0x00008C48}, {0x0000F901, 0x000066F4}, {0x0000F902, 0x00008ECA}, -{0x0000F903, 0x00008CC8}, {0x0000F904, 0x00006ED1}, {0x0000F905, 0x00004E32}, {0x0000F906, 0x000053E5}, -{0x0000F907, 0x00009F9C}, {0x0000F908, 0x00009F9C}, {0x0000F909, 0x00005951}, {0x0000F90A, 0x000091D1}, -{0x0000F90B, 0x00005587}, {0x0000F90C, 0x00005948}, {0x0000F90D, 0x000061F6}, {0x0000F90E, 0x00007669}, -{0x0000F90F, 0x00007F85}, {0x0000F910, 0x0000863F}, {0x0000F911, 0x000087BA}, {0x0000F912, 0x000088F8}, -{0x0000F913, 0x0000908F}, {0x0000F914, 0x00006A02}, {0x0000F915, 0x00006D1B}, {0x0000F916, 0x000070D9}, -{0x0000F917, 0x000073DE}, {0x0000F918, 0x0000843D}, {0x0000F919, 0x0000916A}, {0x0000F91A, 0x000099F1}, -{0x0000F91B, 0x00004E82}, {0x0000F91C, 0x00005375}, {0x0000F91D, 0x00006B04}, {0x0000F91E, 0x0000721B}, -{0x0000F91F, 0x0000862D}, {0x0000F920, 0x00009E1E}, {0x0000F921, 0x00005D50}, {0x0000F922, 0x00006FEB}, -{0x0000F923, 0x000085CD}, {0x0000F924, 0x00008964}, {0x0000F925, 0x000062C9}, {0x0000F926, 0x000081D8}, -{0x0000F927, 0x0000881F}, {0x0000F928, 0x00005ECA}, {0x0000F929, 0x00006717}, {0x0000F92A, 0x00006D6A}, -{0x0000F92B, 0x000072FC}, {0x0000F92C, 0x000090CE}, {0x0000F92D, 0x00004F86}, {0x0000F92E, 0x000051B7}, -{0x0000F92F, 0x000052DE}, {0x0000F930, 0x000064C4}, {0x0000F931, 0x00006AD3}, {0x0000F932, 0x00007210}, -{0x0000F933, 0x000076E7}, {0x0000F934, 0x00008001}, {0x0000F935, 0x00008606}, {0x0000F936, 0x0000865C}, -{0x0000F937, 0x00008DEF}, {0x0000F938, 0x00009732}, {0x0000F939, 0x00009B6F}, {0x0000F93A, 0x00009DFA}, -{0x0000F93B, 0x0000788C}, {0x0000F93C, 0x0000797F}, {0x0000F93D, 0x00007DA0}, {0x0000F93E, 0x000083C9}, -{0x0000F93F, 0x00009304}, {0x0000F940, 0x00009E7F}, {0x0000F941, 0x00008AD6}, {0x0000F942, 0x000058DF}, -{0x0000F943, 0x00005F04}, {0x0000F944, 0x00007C60}, {0x0000F945, 0x0000807E}, {0x0000F946, 0x00007262}, -{0x0000F947, 0x000078CA}, {0x0000F948, 0x00008CC2}, {0x0000F949, 0x000096F7}, {0x0000F94A, 0x000058D8}, -{0x0000F94B, 0x00005C62}, {0x0000F94C, 0x00006A13}, {0x0000F94D, 0x00006DDA}, {0x0000F94E, 0x00006F0F}, -{0x0000F94F, 0x00007D2F}, {0x0000F950, 0x00007E37}, {0x0000F951, 0x0000964B}, {0x0000F952, 0x000052D2}, -{0x0000F953, 0x0000808B}, {0x0000F954, 0x000051DC}, {0x0000F955, 0x000051CC}, {0x0000F956, 0x00007A1C}, -{0x0000F957, 0x00007DBE}, {0x0000F958, 0x000083F1}, {0x0000F959, 0x00009675}, {0x0000F95A, 0x00008B80}, -{0x0000F95B, 0x000062CF}, {0x0000F95C, 0x00006A02}, {0x0000F95D, 0x00008AFE}, {0x0000F95E, 0x00004E39}, -{0x0000F95F, 0x00005BE7}, {0x0000F960, 0x00006012}, {0x0000F961, 0x00007387}, {0x0000F962, 0x00007570}, -{0x0000F963, 0x00005317}, {0x0000F964, 0x000078FB}, {0x0000F965, 0x00004FBF}, {0x0000F966, 0x00005FA9}, -{0x0000F967, 0x00004E0D}, {0x0000F968, 0x00006CCC}, {0x0000F969, 0x00006578}, {0x0000F96A, 0x00007D22}, -{0x0000F96B, 0x000053C3}, {0x0000F96C, 0x0000585E}, {0x0000F96D, 0x00007701}, {0x0000F96E, 0x00008449}, -{0x0000F96F, 0x00008AAA}, {0x0000F970, 0x00006BBA}, {0x0000F971, 0x00008FB0}, {0x0000F972, 0x00006C88}, -{0x0000F973, 0x000062FE}, {0x0000F974, 0x000082E5}, {0x0000F975, 0x000063A0}, {0x0000F976, 0x00007565}, -{0x0000F977, 0x00004EAE}, {0x0000F978, 0x00005169}, {0x0000F979, 0x000051C9}, {0x0000F97A, 0x00006881}, -{0x0000F97B, 0x00007CE7}, {0x0000F97C, 0x0000826F}, {0x0000F97D, 0x00008AD2}, {0x0000F97E, 0x000091CF}, -{0x0000F97F, 0x000052F5}, {0x0000F980, 0x00005442}, {0x0000F981, 0x00005973}, {0x0000F982, 0x00005EEC}, -{0x0000F983, 0x000065C5}, {0x0000F984, 0x00006FFE}, {0x0000F985, 0x0000792A}, {0x0000F986, 0x000095AD}, -{0x0000F987, 0x00009A6A}, {0x0000F988, 0x00009E97}, {0x0000F989, 0x00009ECE}, {0x0000F98A, 0x0000529B}, -{0x0000F98B, 0x000066C6}, {0x0000F98C, 0x00006B77}, {0x0000F98D, 0x00008F62}, {0x0000F98E, 0x00005E74}, -{0x0000F98F, 0x00006190}, {0x0000F990, 0x00006200}, {0x0000F991, 0x0000649A}, {0x0000F992, 0x00006F23}, -{0x0000F993, 0x00007149}, {0x0000F994, 0x00007489}, {0x0000F995, 0x000079CA}, {0x0000F996, 0x00007DF4}, -{0x0000F997, 0x0000806F}, {0x0000F998, 0x00008F26}, {0x0000F999, 0x000084EE}, {0x0000F99A, 0x00009023}, -{0x0000F99B, 0x0000934A}, {0x0000F99C, 0x00005217}, {0x0000F99D, 0x000052A3}, {0x0000F99E, 0x000054BD}, -{0x0000F99F, 0x000070C8}, {0x0000F9A0, 0x000088C2}, {0x0000F9A1, 0x00008AAA}, {0x0000F9A2, 0x00005EC9}, -{0x0000F9A3, 0x00005FF5}, {0x0000F9A4, 0x0000637B}, {0x0000F9A5, 0x00006BAE}, {0x0000F9A6, 0x00007C3E}, -{0x0000F9A7, 0x00007375}, {0x0000F9A8, 0x00004EE4}, {0x0000F9A9, 0x000056F9}, {0x0000F9AA, 0x00005BE7}, -{0x0000F9AB, 0x00005DBA}, {0x0000F9AC, 0x0000601C}, {0x0000F9AD, 0x000073B2}, {0x0000F9AE, 0x00007469}, -{0x0000F9AF, 0x00007F9A}, {0x0000F9B0, 0x00008046}, {0x0000F9B1, 0x00009234}, {0x0000F9B2, 0x000096F6}, -{0x0000F9B3, 0x00009748}, {0x0000F9B4, 0x00009818}, {0x0000F9B5, 0x00004F8B}, {0x0000F9B6, 0x000079AE}, -{0x0000F9B7, 0x000091B4}, {0x0000F9B8, 0x000096B8}, {0x0000F9B9, 0x000060E1}, {0x0000F9BA, 0x00004E86}, -{0x0000F9BB, 0x000050DA}, {0x0000F9BC, 0x00005BEE}, {0x0000F9BD, 0x00005C3F}, {0x0000F9BE, 0x00006599}, -{0x0000F9BF, 0x00006A02}, {0x0000F9C0, 0x000071CE}, {0x0000F9C1, 0x00007642}, {0x0000F9C2, 0x000084FC}, -{0x0000F9C3, 0x0000907C}, {0x0000F9C4, 0x00009F8D}, {0x0000F9C5, 0x00006688}, {0x0000F9C6, 0x0000962E}, -{0x0000F9C7, 0x00005289}, {0x0000F9C8, 0x0000677B}, {0x0000F9C9, 0x000067F3}, {0x0000F9CA, 0x00006D41}, -{0x0000F9CB, 0x00006E9C}, {0x0000F9CC, 0x00007409}, {0x0000F9CD, 0x00007559}, {0x0000F9CE, 0x0000786B}, -{0x0000F9CF, 0x00007D10}, {0x0000F9D0, 0x0000985E}, {0x0000F9D1, 0x0000516D}, {0x0000F9D2, 0x0000622E}, -{0x0000F9D3, 0x00009678}, {0x0000F9D4, 0x0000502B}, {0x0000F9D5, 0x00005D19}, {0x0000F9D6, 0x00006DEA}, -{0x0000F9D7, 0x00008F2A}, {0x0000F9D8, 0x00005F8B}, {0x0000F9D9, 0x00006144}, {0x0000F9DA, 0x00006817}, -{0x0000F9DB, 0x00007387}, {0x0000F9DC, 0x00009686}, {0x0000F9DD, 0x00005229}, {0x0000F9DE, 0x0000540F}, -{0x0000F9DF, 0x00005C65}, {0x0000F9E0, 0x00006613}, {0x0000F9E1, 0x0000674E}, {0x0000F9E2, 0x000068A8}, -{0x0000F9E3, 0x00006CE5}, {0x0000F9E4, 0x00007406}, {0x0000F9E5, 0x000075E2}, {0x0000F9E6, 0x00007F79}, -{0x0000F9E7, 0x000088CF}, {0x0000F9E8, 0x000088E1}, {0x0000F9E9, 0x000091CC}, {0x0000F9EA, 0x000096E2}, -{0x0000F9EB, 0x0000533F}, {0x0000F9EC, 0x00006EBA}, {0x0000F9ED, 0x0000541D}, {0x0000F9EE, 0x000071D0}, -{0x0000F9EF, 0x00007498}, {0x0000F9F0, 0x000085FA}, {0x0000F9F1, 0x000096A3}, {0x0000F9F2, 0x00009C57}, -{0x0000F9F3, 0x00009E9F}, {0x0000F9F4, 0x00006797}, {0x0000F9F5, 0x00006DCB}, {0x0000F9F6, 0x000081E8}, -{0x0000F9F7, 0x00007ACB}, {0x0000F9F8, 0x00007B20}, {0x0000F9F9, 0x00007C92}, {0x0000F9FA, 0x000072C0}, -{0x0000F9FB, 0x00007099}, {0x0000F9FC, 0x00008B58}, {0x0000F9FD, 0x00004EC0}, {0x0000F9FE, 0x00008336}, -{0x0000F9FF, 0x0000523A}, {0x0000FA00, 0x00005207}, {0x0000FA01, 0x00005EA6}, {0x0000FA02, 0x000062D3}, -{0x0000FA03, 0x00007CD6}, {0x0000FA04, 0x00005B85}, {0x0000FA05, 0x00006D1E}, {0x0000FA06, 0x000066B4}, -{0x0000FA07, 0x00008F3B}, {0x0000FA08, 0x0000884C}, {0x0000FA09, 0x0000964D}, {0x0000FA0A, 0x0000898B}, -{0x0000FA0B, 0x00005ED3}, {0x0000FA0C, 0x00005140}, {0x0000FA0D, 0x000055C0}, {0x0000FA10, 0x0000585A}, -{0x0000FA12, 0x00006674}, {0x0000FA15, 0x000051DE}, {0x0000FA16, 0x0000732A}, {0x0000FA17, 0x000076CA}, -{0x0000FA18, 0x0000793C}, {0x0000FA19, 0x0000795E}, {0x0000FA1A, 0x00007965}, {0x0000FA1B, 0x0000798F}, -{0x0000FA1C, 0x00009756}, {0x0000FA1D, 0x00007CBE}, {0x0000FA1E, 0x00007FBD}, {0x0000FA20, 0x00008612}, -{0x0000FA22, 0x00008AF8}, {0x0000FA25, 0x00009038}, {0x0000FA26, 0x000090FD}, {0x0000FA2A, 0x000098EF}, -{0x0000FA2B, 0x000098FC}, {0x0000FA2C, 0x00009928}, {0x0000FA2D, 0x00009DB4}, {0x0000FA2E, 0x000090DE}, -{0x0000FA2F, 0x000096B7}, {0x0000FA30, 0x00004FAE}, {0x0000FA31, 0x000050E7}, {0x0000FA32, 0x0000514D}, -{0x0000FA33, 0x000052C9}, {0x0000FA34, 0x000052E4}, {0x0000FA35, 0x00005351}, {0x0000FA36, 0x0000559D}, -{0x0000FA37, 0x00005606}, {0x0000FA38, 0x00005668}, {0x0000FA39, 0x00005840}, {0x0000FA3A, 0x000058A8}, -{0x0000FA3B, 0x00005C64}, {0x0000FA3C, 0x00005C6E}, {0x0000FA3D, 0x00006094}, {0x0000FA3E, 0x00006168}, -{0x0000FA3F, 0x0000618E}, {0x0000FA40, 0x000061F2}, {0x0000FA41, 0x0000654F}, {0x0000FA42, 0x000065E2}, -{0x0000FA43, 0x00006691}, {0x0000FA44, 0x00006885}, {0x0000FA45, 0x00006D77}, {0x0000FA46, 0x00006E1A}, -{0x0000FA47, 0x00006F22}, {0x0000FA48, 0x0000716E}, {0x0000FA49, 0x0000722B}, {0x0000FA4A, 0x00007422}, -{0x0000FA4B, 0x00007891}, {0x0000FA4C, 0x0000793E}, {0x0000FA4D, 0x00007949}, {0x0000FA4E, 0x00007948}, -{0x0000FA4F, 0x00007950}, {0x0000FA50, 0x00007956}, {0x0000FA51, 0x0000795D}, {0x0000FA52, 0x0000798D}, -{0x0000FA53, 0x0000798E}, {0x0000FA54, 0x00007A40}, {0x0000FA55, 0x00007A81}, {0x0000FA56, 0x00007BC0}, -{0x0000FA57, 0x00007DF4}, {0x0000FA58, 0x00007E09}, {0x0000FA59, 0x00007E41}, {0x0000FA5A, 0x00007F72}, -{0x0000FA5B, 0x00008005}, {0x0000FA5C, 0x000081ED}, {0x0000FA5D, 0x00008279}, {0x0000FA5E, 0x00008279}, -{0x0000FA5F, 0x00008457}, {0x0000FA60, 0x00008910}, {0x0000FA61, 0x00008996}, {0x0000FA62, 0x00008B01}, -{0x0000FA63, 0x00008B39}, {0x0000FA64, 0x00008CD3}, {0x0000FA65, 0x00008D08}, {0x0000FA66, 0x00008FB6}, -{0x0000FA67, 0x00009038}, {0x0000FA68, 0x000096E3}, {0x0000FA69, 0x000097FF}, {0x0000FA6A, 0x0000983B}, -{0x0000FA6B, 0x00006075}, {0x0000FA6C, 0x000242EE}, {0x0000FA6D, 0x00008218}, {0x0000FA70, 0x00004E26}, -{0x0000FA71, 0x000051B5}, {0x0000FA72, 0x00005168}, {0x0000FA73, 0x00004F80}, {0x0000FA74, 0x00005145}, -{0x0000FA75, 0x00005180}, {0x0000FA76, 0x000052C7}, {0x0000FA77, 0x000052FA}, {0x0000FA78, 0x0000559D}, -{0x0000FA79, 0x00005555}, {0x0000FA7A, 0x00005599}, {0x0000FA7B, 0x000055E2}, {0x0000FA7C, 0x0000585A}, -{0x0000FA7D, 0x000058B3}, {0x0000FA7E, 0x00005944}, {0x0000FA7F, 0x00005954}, {0x0000FA80, 0x00005A62}, -{0x0000FA81, 0x00005B28}, {0x0000FA82, 0x00005ED2}, {0x0000FA83, 0x00005ED9}, {0x0000FA84, 0x00005F69}, -{0x0000FA85, 0x00005FAD}, {0x0000FA86, 0x000060D8}, {0x0000FA87, 0x0000614E}, {0x0000FA88, 0x00006108}, -{0x0000FA89, 0x0000618E}, {0x0000FA8A, 0x00006160}, {0x0000FA8B, 0x000061F2}, {0x0000FA8C, 0x00006234}, -{0x0000FA8D, 0x000063C4}, {0x0000FA8E, 0x0000641C}, {0x0000FA8F, 0x00006452}, {0x0000FA90, 0x00006556}, -{0x0000FA91, 0x00006674}, {0x0000FA92, 0x00006717}, {0x0000FA93, 0x0000671B}, {0x0000FA94, 0x00006756}, -{0x0000FA95, 0x00006B79}, {0x0000FA96, 0x00006BBA}, {0x0000FA97, 0x00006D41}, {0x0000FA98, 0x00006EDB}, -{0x0000FA99, 0x00006ECB}, {0x0000FA9A, 0x00006F22}, {0x0000FA9B, 0x0000701E}, {0x0000FA9C, 0x0000716E}, -{0x0000FA9D, 0x000077A7}, {0x0000FA9E, 0x00007235}, {0x0000FA9F, 0x000072AF}, {0x0000FAA0, 0x0000732A}, -{0x0000FAA1, 0x00007471}, {0x0000FAA2, 0x00007506}, {0x0000FAA3, 0x0000753B}, {0x0000FAA4, 0x0000761D}, -{0x0000FAA5, 0x0000761F}, {0x0000FAA6, 0x000076CA}, {0x0000FAA7, 0x000076DB}, {0x0000FAA8, 0x000076F4}, -{0x0000FAA9, 0x0000774A}, {0x0000FAAA, 0x00007740}, {0x0000FAAB, 0x000078CC}, {0x0000FAAC, 0x00007AB1}, -{0x0000FAAD, 0x00007BC0}, {0x0000FAAE, 0x00007C7B}, {0x0000FAAF, 0x00007D5B}, {0x0000FAB0, 0x00007DF4}, -{0x0000FAB1, 0x00007F3E}, {0x0000FAB2, 0x00008005}, {0x0000FAB3, 0x00008352}, {0x0000FAB4, 0x000083EF}, -{0x0000FAB5, 0x00008779}, {0x0000FAB6, 0x00008941}, {0x0000FAB7, 0x00008986}, {0x0000FAB8, 0x00008996}, -{0x0000FAB9, 0x00008ABF}, {0x0000FABA, 0x00008AF8}, {0x0000FABB, 0x00008ACB}, {0x0000FABC, 0x00008B01}, -{0x0000FABD, 0x00008AFE}, {0x0000FABE, 0x00008AED}, {0x0000FABF, 0x00008B39}, {0x0000FAC0, 0x00008B8A}, -{0x0000FAC1, 0x00008D08}, {0x0000FAC2, 0x00008F38}, {0x0000FAC3, 0x00009072}, {0x0000FAC4, 0x00009199}, -{0x0000FAC5, 0x00009276}, {0x0000FAC6, 0x0000967C}, {0x0000FAC7, 0x000096E3}, {0x0000FAC8, 0x00009756}, -{0x0000FAC9, 0x000097DB}, {0x0000FACA, 0x000097FF}, {0x0000FACB, 0x0000980B}, {0x0000FACC, 0x0000983B}, -{0x0000FACD, 0x00009B12}, {0x0000FACE, 0x00009F9C}, {0x0000FACF, 0x0002284A}, {0x0000FAD0, 0x00022844}, -{0x0000FAD1, 0x000233D5}, {0x0000FAD2, 0x00003B9D}, {0x0000FAD3, 0x00004018}, {0x0000FAD4, 0x00004039}, -{0x0000FAD5, 0x00025249}, {0x0000FAD6, 0x00025CD0}, {0x0000FAD7, 0x00027ED3}, {0x0000FAD8, 0x00009F43}, -{0x0000FAD9, 0x00009F8E}, {0x0000FB1D, 0x000005D9}, {0x0000FB1D, 0x000005B4}, {0x0000FB1F, 0x000005F2}, -{0x0000FB1F, 0x000005B7}, {0x0000FB2A, 0x000005E9}, {0x0000FB2A, 0x000005C1}, {0x0000FB2B, 0x000005E9}, -{0x0000FB2B, 0x000005C2}, {0x0000FB2C, 0x000005E9}, {0x0000FB2C, 0x000005BC}, {0x0000FB2C, 0x000005C1}, -{0x0000FB2D, 0x000005E9}, {0x0000FB2D, 0x000005BC}, {0x0000FB2D, 0x000005C2}, {0x0000FB2E, 0x000005D0}, -{0x0000FB2E, 0x000005B7}, {0x0000FB2F, 0x000005D0}, {0x0000FB2F, 0x000005B8}, {0x0000FB30, 0x000005D0}, -{0x0000FB30, 0x000005BC}, {0x0000FB31, 0x000005D1}, {0x0000FB31, 0x000005BC}, {0x0000FB32, 0x000005D2}, -{0x0000FB32, 0x000005BC}, {0x0000FB33, 0x000005D3}, {0x0000FB33, 0x000005BC}, {0x0000FB34, 0x000005D4}, -{0x0000FB34, 0x000005BC}, {0x0000FB35, 0x000005D5}, {0x0000FB35, 0x000005BC}, {0x0000FB36, 0x000005D6}, -{0x0000FB36, 0x000005BC}, {0x0000FB38, 0x000005D8}, {0x0000FB38, 0x000005BC}, {0x0000FB39, 0x000005D9}, -{0x0000FB39, 0x000005BC}, {0x0000FB3A, 0x000005DA}, {0x0000FB3A, 0x000005BC}, {0x0000FB3B, 0x000005DB}, -{0x0000FB3B, 0x000005BC}, {0x0000FB3C, 0x000005DC}, {0x0000FB3C, 0x000005BC}, {0x0000FB3E, 0x000005DE}, -{0x0000FB3E, 0x000005BC}, {0x0000FB40, 0x000005E0}, {0x0000FB40, 0x000005BC}, {0x0000FB41, 0x000005E1}, -{0x0000FB41, 0x000005BC}, {0x0000FB43, 0x000005E3}, {0x0000FB43, 0x000005BC}, {0x0000FB44, 0x000005E4}, -{0x0000FB44, 0x000005BC}, {0x0000FB46, 0x000005E6}, {0x0000FB46, 0x000005BC}, {0x0000FB47, 0x000005E7}, -{0x0000FB47, 0x000005BC}, {0x0000FB48, 0x000005E8}, {0x0000FB48, 0x000005BC}, {0x0000FB49, 0x000005E9}, -{0x0000FB49, 0x000005BC}, {0x0000FB4A, 0x000005EA}, {0x0000FB4A, 0x000005BC}, {0x0000FB4B, 0x000005D5}, -{0x0000FB4B, 0x000005B9}, {0x0000FB4C, 0x000005D1}, {0x0000FB4C, 0x000005BF}, {0x0000FB4D, 0x000005DB}, -{0x0000FB4D, 0x000005BF}, {0x0000FB4E, 0x000005E4}, {0x0000FB4E, 0x000005BF}, {0x0001109A, 0x00011099}, -{0x0001109A, 0x000110BA}, {0x0001109C, 0x0001109B}, {0x0001109C, 0x000110BA}, {0x000110AB, 0x000110A5}, -{0x000110AB, 0x000110BA}, {0x0001112E, 0x00011131}, {0x0001112E, 0x00011127}, {0x0001112F, 0x00011132}, -{0x0001112F, 0x00011127}, {0x0001134B, 0x00011347}, {0x0001134B, 0x0001133E}, {0x0001134C, 0x00011347}, -{0x0001134C, 0x00011357}, {0x000114BB, 0x000114B9}, {0x000114BB, 0x000114BA}, {0x000114BC, 0x000114B9}, -{0x000114BC, 0x000114B0}, {0x000114BE, 0x000114B9}, {0x000114BE, 0x000114BD}, {0x000115BA, 0x000115B8}, -{0x000115BA, 0x000115AF}, {0x000115BB, 0x000115B9}, {0x000115BB, 0x000115AF}, {0x0001D15E, 0x0001D157}, -{0x0001D15E, 0x0001D165}, {0x0001D15F, 0x0001D158}, {0x0001D15F, 0x0001D165}, {0x0001D160, 0x0001D158}, -{0x0001D160, 0x0001D165}, {0x0001D160, 0x0001D16E}, {0x0001D161, 0x0001D158}, {0x0001D161, 0x0001D165}, -{0x0001D161, 0x0001D16F}, {0x0001D162, 0x0001D158}, {0x0001D162, 0x0001D165}, {0x0001D162, 0x0001D170}, -{0x0001D163, 0x0001D158}, {0x0001D163, 0x0001D165}, {0x0001D163, 0x0001D171}, {0x0001D164, 0x0001D158}, -{0x0001D164, 0x0001D165}, {0x0001D164, 0x0001D172}, {0x0001D1BB, 0x0001D1B9}, {0x0001D1BB, 0x0001D165}, -{0x0001D1BC, 0x0001D1BA}, {0x0001D1BC, 0x0001D165}, {0x0001D1BD, 0x0001D1B9}, {0x0001D1BD, 0x0001D165}, -{0x0001D1BD, 0x0001D16E}, {0x0001D1BE, 0x0001D1BA}, {0x0001D1BE, 0x0001D165}, {0x0001D1BE, 0x0001D16E}, -{0x0001D1BF, 0x0001D1B9}, {0x0001D1BF, 0x0001D165}, {0x0001D1BF, 0x0001D16F}, {0x0001D1C0, 0x0001D1BA}, -{0x0001D1C0, 0x0001D165}, {0x0001D1C0, 0x0001D16F}, {0x0002F800, 0x00004E3D}, {0x0002F801, 0x00004E38}, -{0x0002F802, 0x00004E41}, {0x0002F803, 0x00020122}, {0x0002F804, 0x00004F60}, {0x0002F805, 0x00004FAE}, -{0x0002F806, 0x00004FBB}, {0x0002F807, 0x00005002}, {0x0002F808, 0x0000507A}, {0x0002F809, 0x00005099}, -{0x0002F80A, 0x000050E7}, {0x0002F80B, 0x000050CF}, {0x0002F80C, 0x0000349E}, {0x0002F80D, 0x0002063A}, -{0x0002F80E, 0x0000514D}, {0x0002F80F, 0x00005154}, {0x0002F810, 0x00005164}, {0x0002F811, 0x00005177}, -{0x0002F812, 0x0002051C}, {0x0002F813, 0x000034B9}, {0x0002F814, 0x00005167}, {0x0002F815, 0x0000518D}, -{0x0002F816, 0x0002054B}, {0x0002F817, 0x00005197}, {0x0002F818, 0x000051A4}, {0x0002F819, 0x00004ECC}, -{0x0002F81A, 0x000051AC}, {0x0002F81B, 0x000051B5}, {0x0002F81C, 0x000291DF}, {0x0002F81D, 0x000051F5}, -{0x0002F81E, 0x00005203}, {0x0002F81F, 0x000034DF}, {0x0002F820, 0x0000523B}, {0x0002F821, 0x00005246}, -{0x0002F822, 0x00005272}, {0x0002F823, 0x00005277}, {0x0002F824, 0x00003515}, {0x0002F825, 0x000052C7}, -{0x0002F826, 0x000052C9}, {0x0002F827, 0x000052E4}, {0x0002F828, 0x000052FA}, {0x0002F829, 0x00005305}, -{0x0002F82A, 0x00005306}, {0x0002F82B, 0x00005317}, {0x0002F82C, 0x00005349}, {0x0002F82D, 0x00005351}, -{0x0002F82E, 0x0000535A}, {0x0002F82F, 0x00005373}, {0x0002F830, 0x0000537D}, {0x0002F831, 0x0000537F}, -{0x0002F832, 0x0000537F}, {0x0002F833, 0x0000537F}, {0x0002F834, 0x00020A2C}, {0x0002F835, 0x00007070}, -{0x0002F836, 0x000053CA}, {0x0002F837, 0x000053DF}, {0x0002F838, 0x00020B63}, {0x0002F839, 0x000053EB}, -{0x0002F83A, 0x000053F1}, {0x0002F83B, 0x00005406}, {0x0002F83C, 0x0000549E}, {0x0002F83D, 0x00005438}, -{0x0002F83E, 0x00005448}, {0x0002F83F, 0x00005468}, {0x0002F840, 0x000054A2}, {0x0002F841, 0x000054F6}, -{0x0002F842, 0x00005510}, {0x0002F843, 0x00005553}, {0x0002F844, 0x00005563}, {0x0002F845, 0x00005584}, -{0x0002F846, 0x00005584}, {0x0002F847, 0x00005599}, {0x0002F848, 0x000055AB}, {0x0002F849, 0x000055B3}, -{0x0002F84A, 0x000055C2}, {0x0002F84B, 0x00005716}, {0x0002F84C, 0x00005606}, {0x0002F84D, 0x00005717}, -{0x0002F84E, 0x00005651}, {0x0002F84F, 0x00005674}, {0x0002F850, 0x00005207}, {0x0002F851, 0x000058EE}, -{0x0002F852, 0x000057CE}, {0x0002F853, 0x000057F4}, {0x0002F854, 0x0000580D}, {0x0002F855, 0x0000578B}, -{0x0002F856, 0x00005832}, {0x0002F857, 0x00005831}, {0x0002F858, 0x000058AC}, {0x0002F859, 0x000214E4}, -{0x0002F85A, 0x000058F2}, {0x0002F85B, 0x000058F7}, {0x0002F85C, 0x00005906}, {0x0002F85D, 0x0000591A}, -{0x0002F85E, 0x00005922}, {0x0002F85F, 0x00005962}, {0x0002F860, 0x000216A8}, {0x0002F861, 0x000216EA}, -{0x0002F862, 0x000059EC}, {0x0002F863, 0x00005A1B}, {0x0002F864, 0x00005A27}, {0x0002F865, 0x000059D8}, -{0x0002F866, 0x00005A66}, {0x0002F867, 0x000036EE}, {0x0002F868, 0x000036FC}, {0x0002F869, 0x00005B08}, -{0x0002F86A, 0x00005B3E}, {0x0002F86B, 0x00005B3E}, {0x0002F86C, 0x000219C8}, {0x0002F86D, 0x00005BC3}, -{0x0002F86E, 0x00005BD8}, {0x0002F86F, 0x00005BE7}, {0x0002F870, 0x00005BF3}, {0x0002F871, 0x00021B18}, -{0x0002F872, 0x00005BFF}, {0x0002F873, 0x00005C06}, {0x0002F874, 0x00005F53}, {0x0002F875, 0x00005C22}, -{0x0002F876, 0x00003781}, {0x0002F877, 0x00005C60}, {0x0002F878, 0x00005C6E}, {0x0002F879, 0x00005CC0}, -{0x0002F87A, 0x00005C8D}, {0x0002F87B, 0x00021DE4}, {0x0002F87C, 0x00005D43}, {0x0002F87D, 0x00021DE6}, -{0x0002F87E, 0x00005D6E}, {0x0002F87F, 0x00005D6B}, {0x0002F880, 0x00005D7C}, {0x0002F881, 0x00005DE1}, -{0x0002F882, 0x00005DE2}, {0x0002F883, 0x0000382F}, {0x0002F884, 0x00005DFD}, {0x0002F885, 0x00005E28}, -{0x0002F886, 0x00005E3D}, {0x0002F887, 0x00005E69}, {0x0002F888, 0x00003862}, {0x0002F889, 0x00022183}, -{0x0002F88A, 0x0000387C}, {0x0002F88B, 0x00005EB0}, {0x0002F88C, 0x00005EB3}, {0x0002F88D, 0x00005EB6}, -{0x0002F88E, 0x00005ECA}, {0x0002F88F, 0x0002A392}, {0x0002F890, 0x00005EFE}, {0x0002F891, 0x00022331}, -{0x0002F892, 0x00022331}, {0x0002F893, 0x00008201}, {0x0002F894, 0x00005F22}, {0x0002F895, 0x00005F22}, -{0x0002F896, 0x000038C7}, {0x0002F897, 0x000232B8}, {0x0002F898, 0x000261DA}, {0x0002F899, 0x00005F62}, -{0x0002F89A, 0x00005F6B}, {0x0002F89B, 0x000038E3}, {0x0002F89C, 0x00005F9A}, {0x0002F89D, 0x00005FCD}, -{0x0002F89E, 0x00005FD7}, {0x0002F89F, 0x00005FF9}, {0x0002F8A0, 0x00006081}, {0x0002F8A1, 0x0000393A}, -{0x0002F8A2, 0x0000391C}, {0x0002F8A3, 0x00006094}, {0x0002F8A4, 0x000226D4}, {0x0002F8A5, 0x000060C7}, -{0x0002F8A6, 0x00006148}, {0x0002F8A7, 0x0000614C}, {0x0002F8A8, 0x0000614E}, {0x0002F8A9, 0x0000614C}, -{0x0002F8AA, 0x0000617A}, {0x0002F8AB, 0x0000618E}, {0x0002F8AC, 0x000061B2}, {0x0002F8AD, 0x000061A4}, -{0x0002F8AE, 0x000061AF}, {0x0002F8AF, 0x000061DE}, {0x0002F8B0, 0x000061F2}, {0x0002F8B1, 0x000061F6}, -{0x0002F8B2, 0x00006210}, {0x0002F8B3, 0x0000621B}, {0x0002F8B4, 0x0000625D}, {0x0002F8B5, 0x000062B1}, -{0x0002F8B6, 0x000062D4}, {0x0002F8B7, 0x00006350}, {0x0002F8B8, 0x00022B0C}, {0x0002F8B9, 0x0000633D}, -{0x0002F8BA, 0x000062FC}, {0x0002F8BB, 0x00006368}, {0x0002F8BC, 0x00006383}, {0x0002F8BD, 0x000063E4}, -{0x0002F8BE, 0x00022BF1}, {0x0002F8BF, 0x00006422}, {0x0002F8C0, 0x000063C5}, {0x0002F8C1, 0x000063A9}, -{0x0002F8C2, 0x00003A2E}, {0x0002F8C3, 0x00006469}, {0x0002F8C4, 0x0000647E}, {0x0002F8C5, 0x0000649D}, -{0x0002F8C6, 0x00006477}, {0x0002F8C7, 0x00003A6C}, {0x0002F8C8, 0x0000654F}, {0x0002F8C9, 0x0000656C}, -{0x0002F8CA, 0x0002300A}, {0x0002F8CB, 0x000065E3}, {0x0002F8CC, 0x000066F8}, {0x0002F8CD, 0x00006649}, -{0x0002F8CE, 0x00003B19}, {0x0002F8CF, 0x00006691}, {0x0002F8D0, 0x00003B08}, {0x0002F8D1, 0x00003AE4}, -{0x0002F8D2, 0x00005192}, {0x0002F8D3, 0x00005195}, {0x0002F8D4, 0x00006700}, {0x0002F8D5, 0x0000669C}, -{0x0002F8D6, 0x000080AD}, {0x0002F8D7, 0x000043D9}, {0x0002F8D8, 0x00006717}, {0x0002F8D9, 0x0000671B}, -{0x0002F8DA, 0x00006721}, {0x0002F8DB, 0x0000675E}, {0x0002F8DC, 0x00006753}, {0x0002F8DD, 0x000233C3}, -{0x0002F8DE, 0x00003B49}, {0x0002F8DF, 0x000067FA}, {0x0002F8E0, 0x00006785}, {0x0002F8E1, 0x00006852}, -{0x0002F8E2, 0x00006885}, {0x0002F8E3, 0x0002346D}, {0x0002F8E4, 0x0000688E}, {0x0002F8E5, 0x0000681F}, -{0x0002F8E6, 0x00006914}, {0x0002F8E7, 0x00003B9D}, {0x0002F8E8, 0x00006942}, {0x0002F8E9, 0x000069A3}, -{0x0002F8EA, 0x000069EA}, {0x0002F8EB, 0x00006AA8}, {0x0002F8EC, 0x000236A3}, {0x0002F8ED, 0x00006ADB}, -{0x0002F8EE, 0x00003C18}, {0x0002F8EF, 0x00006B21}, {0x0002F8F0, 0x000238A7}, {0x0002F8F1, 0x00006B54}, -{0x0002F8F2, 0x00003C4E}, {0x0002F8F3, 0x00006B72}, {0x0002F8F4, 0x00006B9F}, {0x0002F8F5, 0x00006BBA}, -{0x0002F8F6, 0x00006BBB}, {0x0002F8F7, 0x00023A8D}, {0x0002F8F8, 0x00021D0B}, {0x0002F8F9, 0x00023AFA}, -{0x0002F8FA, 0x00006C4E}, {0x0002F8FB, 0x00023CBC}, {0x0002F8FC, 0x00006CBF}, {0x0002F8FD, 0x00006CCD}, -{0x0002F8FE, 0x00006C67}, {0x0002F8FF, 0x00006D16}, {0x0002F900, 0x00006D3E}, {0x0002F901, 0x00006D77}, -{0x0002F902, 0x00006D41}, {0x0002F903, 0x00006D69}, {0x0002F904, 0x00006D78}, {0x0002F905, 0x00006D85}, -{0x0002F906, 0x00023D1E}, {0x0002F907, 0x00006D34}, {0x0002F908, 0x00006E2F}, {0x0002F909, 0x00006E6E}, -{0x0002F90A, 0x00003D33}, {0x0002F90B, 0x00006ECB}, {0x0002F90C, 0x00006EC7}, {0x0002F90D, 0x00023ED1}, -{0x0002F90E, 0x00006DF9}, {0x0002F90F, 0x00006F6E}, {0x0002F910, 0x00023F5E}, {0x0002F911, 0x00023F8E}, -{0x0002F912, 0x00006FC6}, {0x0002F913, 0x00007039}, {0x0002F914, 0x0000701E}, {0x0002F915, 0x0000701B}, -{0x0002F916, 0x00003D96}, {0x0002F917, 0x0000704A}, {0x0002F918, 0x0000707D}, {0x0002F919, 0x00007077}, -{0x0002F91A, 0x000070AD}, {0x0002F91B, 0x00020525}, {0x0002F91C, 0x00007145}, {0x0002F91D, 0x00024263}, -{0x0002F91E, 0x0000719C}, {0x0002F91F, 0x000243AB}, {0x0002F920, 0x00007228}, {0x0002F921, 0x00007235}, -{0x0002F922, 0x00007250}, {0x0002F923, 0x00024608}, {0x0002F924, 0x00007280}, {0x0002F925, 0x00007295}, -{0x0002F926, 0x00024735}, {0x0002F927, 0x00024814}, {0x0002F928, 0x0000737A}, {0x0002F929, 0x0000738B}, -{0x0002F92A, 0x00003EAC}, {0x0002F92B, 0x000073A5}, {0x0002F92C, 0x00003EB8}, {0x0002F92D, 0x00003EB8}, -{0x0002F92E, 0x00007447}, {0x0002F92F, 0x0000745C}, {0x0002F930, 0x00007471}, {0x0002F931, 0x00007485}, -{0x0002F932, 0x000074CA}, {0x0002F933, 0x00003F1B}, {0x0002F934, 0x00007524}, {0x0002F935, 0x00024C36}, -{0x0002F936, 0x0000753E}, {0x0002F937, 0x00024C92}, {0x0002F938, 0x00007570}, {0x0002F939, 0x0002219F}, -{0x0002F93A, 0x00007610}, {0x0002F93B, 0x00024FA1}, {0x0002F93C, 0x00024FB8}, {0x0002F93D, 0x00025044}, -{0x0002F93E, 0x00003FFC}, {0x0002F93F, 0x00004008}, {0x0002F940, 0x000076F4}, {0x0002F941, 0x000250F3}, -{0x0002F942, 0x000250F2}, {0x0002F943, 0x00025119}, {0x0002F944, 0x00025133}, {0x0002F945, 0x0000771E}, -{0x0002F946, 0x0000771F}, {0x0002F947, 0x0000771F}, {0x0002F948, 0x0000774A}, {0x0002F949, 0x00004039}, -{0x0002F94A, 0x0000778B}, {0x0002F94B, 0x00004046}, {0x0002F94C, 0x00004096}, {0x0002F94D, 0x0002541D}, -{0x0002F94E, 0x0000784E}, {0x0002F94F, 0x0000788C}, {0x0002F950, 0x000078CC}, {0x0002F951, 0x000040E3}, -{0x0002F952, 0x00025626}, {0x0002F953, 0x00007956}, {0x0002F954, 0x0002569A}, {0x0002F955, 0x000256C5}, -{0x0002F956, 0x0000798F}, {0x0002F957, 0x000079EB}, {0x0002F958, 0x0000412F}, {0x0002F959, 0x00007A40}, -{0x0002F95A, 0x00007A4A}, {0x0002F95B, 0x00007A4F}, {0x0002F95C, 0x0002597C}, {0x0002F95D, 0x00025AA7}, -{0x0002F95E, 0x00025AA7}, {0x0002F95F, 0x00007AEE}, {0x0002F960, 0x00004202}, {0x0002F961, 0x00025BAB}, -{0x0002F962, 0x00007BC6}, {0x0002F963, 0x00007BC9}, {0x0002F964, 0x00004227}, {0x0002F965, 0x00025C80}, -{0x0002F966, 0x00007CD2}, {0x0002F967, 0x000042A0}, {0x0002F968, 0x00007CE8}, {0x0002F969, 0x00007CE3}, -{0x0002F96A, 0x00007D00}, {0x0002F96B, 0x00025F86}, {0x0002F96C, 0x00007D63}, {0x0002F96D, 0x00004301}, -{0x0002F96E, 0x00007DC7}, {0x0002F96F, 0x00007E02}, {0x0002F970, 0x00007E45}, {0x0002F971, 0x00004334}, -{0x0002F972, 0x00026228}, {0x0002F973, 0x00026247}, {0x0002F974, 0x00004359}, {0x0002F975, 0x000262D9}, -{0x0002F976, 0x00007F7A}, {0x0002F977, 0x0002633E}, {0x0002F978, 0x00007F95}, {0x0002F979, 0x00007FFA}, -{0x0002F97A, 0x00008005}, {0x0002F97B, 0x000264DA}, {0x0002F97C, 0x00026523}, {0x0002F97D, 0x00008060}, -{0x0002F97E, 0x000265A8}, {0x0002F97F, 0x00008070}, {0x0002F980, 0x0002335F}, {0x0002F981, 0x000043D5}, -{0x0002F982, 0x000080B2}, {0x0002F983, 0x00008103}, {0x0002F984, 0x0000440B}, {0x0002F985, 0x0000813E}, -{0x0002F986, 0x00005AB5}, {0x0002F987, 0x000267A7}, {0x0002F988, 0x000267B5}, {0x0002F989, 0x00023393}, -{0x0002F98A, 0x0002339C}, {0x0002F98B, 0x00008201}, {0x0002F98C, 0x00008204}, {0x0002F98D, 0x00008F9E}, -{0x0002F98E, 0x0000446B}, {0x0002F98F, 0x00008291}, {0x0002F990, 0x0000828B}, {0x0002F991, 0x0000829D}, -{0x0002F992, 0x000052B3}, {0x0002F993, 0x000082B1}, {0x0002F994, 0x000082B3}, {0x0002F995, 0x000082BD}, -{0x0002F996, 0x000082E6}, {0x0002F997, 0x00026B3C}, {0x0002F998, 0x000082E5}, {0x0002F999, 0x0000831D}, -{0x0002F99A, 0x00008363}, {0x0002F99B, 0x000083AD}, {0x0002F99C, 0x00008323}, {0x0002F99D, 0x000083BD}, -{0x0002F99E, 0x000083E7}, {0x0002F99F, 0x00008457}, {0x0002F9A0, 0x00008353}, {0x0002F9A1, 0x000083CA}, -{0x0002F9A2, 0x000083CC}, {0x0002F9A3, 0x000083DC}, {0x0002F9A4, 0x00026C36}, {0x0002F9A5, 0x00026D6B}, -{0x0002F9A6, 0x00026CD5}, {0x0002F9A7, 0x0000452B}, {0x0002F9A8, 0x000084F1}, {0x0002F9A9, 0x000084F3}, -{0x0002F9AA, 0x00008516}, {0x0002F9AB, 0x000273CA}, {0x0002F9AC, 0x00008564}, {0x0002F9AD, 0x00026F2C}, -{0x0002F9AE, 0x0000455D}, {0x0002F9AF, 0x00004561}, {0x0002F9B0, 0x00026FB1}, {0x0002F9B1, 0x000270D2}, -{0x0002F9B2, 0x0000456B}, {0x0002F9B3, 0x00008650}, {0x0002F9B4, 0x0000865C}, {0x0002F9B5, 0x00008667}, -{0x0002F9B6, 0x00008669}, {0x0002F9B7, 0x000086A9}, {0x0002F9B8, 0x00008688}, {0x0002F9B9, 0x0000870E}, -{0x0002F9BA, 0x000086E2}, {0x0002F9BB, 0x00008779}, {0x0002F9BC, 0x00008728}, {0x0002F9BD, 0x0000876B}, -{0x0002F9BE, 0x00008786}, {0x0002F9BF, 0x000045D7}, {0x0002F9C0, 0x000087E1}, {0x0002F9C1, 0x00008801}, -{0x0002F9C2, 0x000045F9}, {0x0002F9C3, 0x00008860}, {0x0002F9C4, 0x00008863}, {0x0002F9C5, 0x00027667}, -{0x0002F9C6, 0x000088D7}, {0x0002F9C7, 0x000088DE}, {0x0002F9C8, 0x00004635}, {0x0002F9C9, 0x000088FA}, -{0x0002F9CA, 0x000034BB}, {0x0002F9CB, 0x000278AE}, {0x0002F9CC, 0x00027966}, {0x0002F9CD, 0x000046BE}, -{0x0002F9CE, 0x000046C7}, {0x0002F9CF, 0x00008AA0}, {0x0002F9D0, 0x00008AED}, {0x0002F9D1, 0x00008B8A}, -{0x0002F9D2, 0x00008C55}, {0x0002F9D3, 0x00027CA8}, {0x0002F9D4, 0x00008CAB}, {0x0002F9D5, 0x00008CC1}, -{0x0002F9D6, 0x00008D1B}, {0x0002F9D7, 0x00008D77}, {0x0002F9D8, 0x00027F2F}, {0x0002F9D9, 0x00020804}, -{0x0002F9DA, 0x00008DCB}, {0x0002F9DB, 0x00008DBC}, {0x0002F9DC, 0x00008DF0}, {0x0002F9DD, 0x000208DE}, -{0x0002F9DE, 0x00008ED4}, {0x0002F9DF, 0x00008F38}, {0x0002F9E0, 0x000285D2}, {0x0002F9E1, 0x000285ED}, -{0x0002F9E2, 0x00009094}, {0x0002F9E3, 0x000090F1}, {0x0002F9E4, 0x00009111}, {0x0002F9E5, 0x0002872E}, -{0x0002F9E6, 0x0000911B}, {0x0002F9E7, 0x00009238}, {0x0002F9E8, 0x000092D7}, {0x0002F9E9, 0x000092D8}, -{0x0002F9EA, 0x0000927C}, {0x0002F9EB, 0x000093F9}, {0x0002F9EC, 0x00009415}, {0x0002F9ED, 0x00028BFA}, -{0x0002F9EE, 0x0000958B}, {0x0002F9EF, 0x00004995}, {0x0002F9F0, 0x000095B7}, {0x0002F9F1, 0x00028D77}, -{0x0002F9F2, 0x000049E6}, {0x0002F9F3, 0x000096C3}, {0x0002F9F4, 0x00005DB2}, {0x0002F9F5, 0x00009723}, -{0x0002F9F6, 0x00029145}, {0x0002F9F7, 0x0002921A}, {0x0002F9F8, 0x00004A6E}, {0x0002F9F9, 0x00004A76}, -{0x0002F9FA, 0x000097E0}, {0x0002F9FB, 0x0002940A}, {0x0002F9FC, 0x00004AB2}, {0x0002F9FD, 0x00029496}, -{0x0002F9FE, 0x0000980B}, {0x0002F9FF, 0x0000980B}, {0x0002FA00, 0x00009829}, {0x0002FA01, 0x000295B6}, -{0x0002FA02, 0x000098E2}, {0x0002FA03, 0x00004B33}, {0x0002FA04, 0x00009929}, {0x0002FA05, 0x000099A7}, -{0x0002FA06, 0x000099C2}, {0x0002FA07, 0x000099FE}, {0x0002FA08, 0x00004BCE}, {0x0002FA09, 0x00029B30}, -{0x0002FA0A, 0x00009B12}, {0x0002FA0B, 0x00009C40}, {0x0002FA0C, 0x00009CFD}, {0x0002FA0D, 0x00004CCE}, -{0x0002FA0E, 0x00004CED}, {0x0002FA0F, 0x00009D67}, {0x0002FA10, 0x0002A0CE}, {0x0002FA11, 0x00004CF8}, -{0x0002FA12, 0x0002A105}, {0x0002FA13, 0x0002A20E}, {0x0002FA14, 0x0002A291}, {0x0002FA15, 0x00009EBB}, -{0x0002FA16, 0x00004D56}, {0x0002FA17, 0x00009EF9}, {0x0002FA18, 0x00009EFE}, {0x0002FA19, 0x00009F05}, -{0x0002FA1A, 0x00009F0F}, {0x0002FA1B, 0x00009F16}, {0x0002FA1D, 0x0002A600}, -}; +#include +#include static std::string unicode_cpts_to_utf8(const std::vector & cps) { std::string result; @@ -1452,23 +57,22 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) offset += 4; return result; } - throw std::invalid_argument("invalid string"); + throw std::invalid_argument("failed to convert utf8 to codepoint"); } -static std::vector unicode_cpt_to_utf16(uint32_t cp) { - std::vector result; - if (/* 0x0000 <= cp && */ cp <= 0xffff) { - result.emplace_back(cp); - } - else if (0x10000 <= cp && cp <= 0x10ffff) { - result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); - result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); - } - else { - throw std::invalid_argument("invalid cpt"); - } - return result; -} +//static std::vector unicode_cpt_to_utf16(uint32_t cp) { +// std::vector result; +// if (/* 0x0000 <= cp && */ cp <= 0xffff) { +// result.emplace_back(cp); +// return result; +// } +// if (0x10000 <= cp && cp <= 0x10ffff) { +// result.emplace_back(0xd800 | ((cp - 0x10000) >> 10)); +// result.emplace_back(0xdc00 | ((cp - 0x10000) & 0x03ff)); +// return result; +// } +// throw std::invalid_argument("failed to convert codepoint to utf16"); +//} //static std::vector unicode_cpts_to_utf16(const std::vector & cps) { // std::vector result; @@ -1479,56 +83,56 @@ static std::vector unicode_cpt_to_utf16(uint32_t cp) { // return result; //} -static uint32_t cpt_from_utf16(const std::vector & utf16, size_t & offset) { - assert(offset < utf16.size()); - if (((utf16[0] >> 10) << 10) != 0xd800) { - auto result = utf16[offset + 0]; - offset += 1; - return result; - } - - if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { - throw std::invalid_argument("invalid character"); - } - - auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); - offset += 2; - return result; -} +//static uint32_t unicode_cpt_from_utf16(const std::vector & utf16, size_t & offset) { +// assert(offset < utf16.size()); +// if (((utf16[0] >> 10) << 10) != 0xd800) { +// auto result = utf16[offset + 0]; +// offset += 1; +// return result; +// } +// +// if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) { +// throw std::invalid_argument("invalid character"); +// } +// +// auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff)); +// offset += 2; +// return result; +//} //static std::vector unicode_cpts_from_utf16(const std::vector & utf16) { // std::vector result; // size_t offset = 0; // while (offset < utf16.size()) { -// result.push_back(cpt_from_utf16(utf16, offset)); +// result.push_back(unicode_cpt_from_utf16(utf16, offset)); // } // return result; //} static std::unordered_map unicode_cpt_type_map() { std::unordered_map cpt_types; - for (auto p : unicode_ranges_digit) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_DIGIT; + for (auto p : unicode_ranges_number) { + for (auto i = p.first; i <= p.second; ++i) { + cpt_types[i] = CODEPOINT_TYPE_NUMBER; } } for (auto p : unicode_ranges_letter) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_LETTER; } } - for (auto p : unicode_ranges_whitespace) { - for (auto i = p.first; i <= p.second; ++ i) { - cpt_types[i] = CODEPOINT_TYPE_WHITESPACE; + for (auto p : unicode_ranges_separator) { + for (auto i = p.first; i <= p.second; ++i) { + cpt_types[i] = CODEPOINT_TYPE_SEPARATOR; } } for (auto p : unicode_ranges_accent_mark) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK; } } for (auto p : unicode_ranges_punctuation) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION; } } @@ -1538,7 +142,7 @@ static std::unordered_map unicode_cpt_type_map() { } } for (auto p : unicode_ranges_control) { - for (auto i = p.first; i <= p.second; ++ i) { + for (auto i = p.first; i <= p.second; ++i) { cpt_types[i] = CODEPOINT_TYPE_CONTROL; } } @@ -1593,34 +197,395 @@ static std::unordered_map unicode_utf8_to_byte_map() { return map; } +static inline std::wstring unicode_wstring_from_utf8(const std::string & s) { + std::wstring_convert> conv; + return conv.from_bytes(s); +} + +static std::vector unicode_byte_encoding_process(const std::vector & bpe_words) { + std::vector bpe_encoded_words; + for (const auto & word : bpe_words) { + std::string text_utf; + auto utf_word = unicode_cpts_from_utf8(word); + for (size_t i = 0; i < utf_word.size(); ++i) { + text_utf += unicode_cpt_to_utf8(utf_word[i]); + } + + std::string encoded_token; + for (char & c : text_utf) { + encoded_token += unicode_byte_to_utf8(c); + } + bpe_encoded_words.emplace_back(encoded_token); + } + return bpe_encoded_words; +} + +// GPT2 system regex: 's|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+ +static std::vector unicode_regex_split_custom_gpt2(const std::string & text, const std::vector & offsets) { + std::vector bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + + const auto cpts = unicode_cpts_from_utf8(text); + + size_t start = 0; + for (auto offset : offsets) { + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_cpt_type = [&] (const size_t pos) -> int { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const int cpt_type = _get_cpt_type(pos); + + // regex: 's|'t|'re|'ve|'m|'ll|'d + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = _get_cpt(pos+1); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; + } + if (pos+2 < offset_end) { + char32_t cpt_next_next = _get_cpt(pos+2); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; + } + } + } + + char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt); + int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type); + // regex: ?\p{L}+ + if (cpt2_type == CODEPOINT_TYPE_LETTER) { + pos += (cpt == ' '); + while (cpt2_type == CODEPOINT_TYPE_LETTER) { + cpt2_type = _get_cpt_type(++pos); + } + _add_token(pos); + continue; + } + // regex: ?\p{N}+ + if (cpt2_type == CODEPOINT_TYPE_NUMBER) { + pos += (cpt == ' '); + while (cpt2_type == CODEPOINT_TYPE_NUMBER) { + cpt2_type = _get_cpt_type(++pos); + } + _add_token(pos); + continue; + } + // regex: ?[^\s\p{L}\p{N}]+ + if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + pos += (cpt == ' '); + while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + cpt2_type = _get_cpt_type(++pos); + cpt2 = _get_cpt(pos); + } + _add_token(pos); + continue; + } + + size_t num_whitespaces = 0; + while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) { + num_whitespaces++; + } + + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; + } + + // regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; + } + + // no matches + _add_token(++pos); + } + } + + return bpe_offsets; +} + +// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" +static std::vector unicode_regex_split_custom_llama3(const std::string & text, const std::vector & offsets) { + std::vector bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + + const auto cpts = unicode_cpts_from_utf8(text); + + size_t start = 0; + for (auto offset : offsets) { + const size_t offset_ini = start; + const size_t offset_end = start + offset; + assert(offset_end <= cpts.size()); + start = offset_end; + + auto _get_cpt = [&] (const size_t pos) -> char32_t { + return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0; + }; + + auto _get_cpt_type = [&] (const size_t pos) -> int { + return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED; + }; + + size_t _prev_end = offset_ini; + auto _add_token = [&] (const size_t end) -> size_t { + assert(_prev_end <= end && end <= offset_end); + size_t len = end - _prev_end; + if (len > 0) { + bpe_offsets.push_back(len); + } + _prev_end = end; + //if (len > 0) { + // std::string s = ""; + // for(size_t p = end-len; p < end; p++) + // s += unicode_cpt_to_utf8(cpts[p]); + // printf(">>> '%s'\n", s.c_str()); + //} + return len; + }; + + for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) { + const char32_t cpt = _get_cpt(pos); + const int cpt_type = _get_cpt_type(pos); + + // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive + if (cpt == '\'' && pos+1 < offset_end) { + char32_t cpt_next = unicode_tolower(_get_cpt(pos+1)); + if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') { + pos += _add_token(pos+2); + continue; + } + if (pos+2 < offset_end) { + char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2)); + if ((cpt_next == 'r' && cpt_next_next == 'e') || + (cpt_next == 'v' && cpt_next_next == 'e') || + (cpt_next == 'l' && cpt_next_next == 'l')) { + pos += _add_token(pos+3); + continue; + } + } + } + + // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct? + if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) { + if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters + pos++; + while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) { + pos++; + } + _add_token(pos); + continue; + } + } + + // regex: \p{N}{1,3} + if (cpt_type == CODEPOINT_TYPE_NUMBER) { + size_t ini = pos; + while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) { + if (++pos - ini >= 3 ) { + _add_token(pos); + ini = pos; + } + } + _add_token(pos); + continue; + } + + // regex: ?[^\s\p{L}\p{N}]+[\r\n]* + char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt); + int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type); + if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + pos += (cpt == ' '); + while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) { + cpt2_type = _get_cpt_type(++pos); + cpt2 = _get_cpt(pos); + } + while (cpt2 == '\r' || cpt2 == '\n') { + cpt2 = _get_cpt(++pos); + } + _add_token(pos); + continue; + } + + size_t num_whitespaces = 0; + size_t last_end_r_or_n = 0; + while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) { + char32_t cpt2 = _get_cpt(pos+num_whitespaces); + if (cpt2 == '\r' || cpt2 == '\n') { + last_end_r_or_n = pos + num_whitespaces + 1; + } + num_whitespaces++; + } + + // regex: \s*[\r\n]+ + if (last_end_r_or_n > 0) { + pos = last_end_r_or_n; + _add_token(pos); + continue; + } + + // regex: \s+(?!\S) + if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) { + pos += num_whitespaces - 1; + _add_token(pos); + continue; + } + + // regex: \s+ + if (num_whitespaces > 0) { + pos += num_whitespaces; + _add_token(pos); + continue; + } + + // no matches + _add_token(++pos); + } + } + + return bpe_offsets; +} + +// use std::wregex to split the text +static std::vector unicode_regex_split_stl(const std::wstring & wtext, const std::wstring & regex_expr, const std::vector & offsets) { + std::wregex expr(regex_expr); + std::vector bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for (auto offset : offsets) { + std::wcregex_iterator it(wtext.data() + start, wtext.data() + start + offset, expr); + std::wcregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::wcmatch match = *it; + if (match.position() > start_idx) { + bpe_offsets.emplace_back(match.position() - start_idx); + } + bpe_offsets.emplace_back(match.length()); + start_idx = match.position() + match.length(); + ++it; + } + + if (start_idx < (int64_t) offset) { + bpe_offsets.emplace_back(offset - start_idx); + } + start += offset; + } + + return bpe_offsets; +} + +// use std::regex to split the text +static std::vector unicode_regex_split_stl(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { + std::regex expr(regex_expr); + std::vector bpe_offsets; // store the offset of each word + bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size + size_t start = 0; + for (auto offset : offsets) { + std::cregex_iterator it(text.data() + start, text.data() + start + offset, expr); + std::cregex_iterator end; + + int64_t start_idx = 0; + while (it != end) { + std::cmatch match = *it; + if (match.position() > start_idx) { + bpe_offsets.emplace_back(match.position() - start_idx); + } + bpe_offsets.emplace_back(match.length()); + start_idx = match.position() + match.length(); + ++it; + } + + if (start_idx < (int64_t) offset) { + bpe_offsets.emplace_back(offset - start_idx); + } + start += offset; + } + + return bpe_offsets; +} + +static std::vector unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector & offsets) { + std::vector bpe_offsets; + + if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") { + bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets); + } else if ( + regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" || + regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") { + + bpe_offsets = unicode_regex_split_custom_llama3(text, offsets); + } + + return bpe_offsets; +} + // // interface // std::string unicode_cpt_to_utf8(uint32_t cp) { std::string result; + if (/* 0x00 <= cp && */ cp <= 0x7f) { result.push_back(cp); + return result; } - else if (0x80 <= cp && cp <= 0x7ff) { + if (0x80 <= cp && cp <= 0x7ff) { result.push_back(0xc0 | ((cp >> 6) & 0x1f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else if (0x800 <= cp && cp <= 0xffff) { + if (0x800 <= cp && cp <= 0xffff) { result.push_back(0xe0 | ((cp >> 12) & 0x0f)); result.push_back(0x80 | ((cp >> 6) & 0x3f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else if (0x10000 <= cp && cp <= 0x10ffff) { + if (0x10000 <= cp && cp <= 0x10ffff) { result.push_back(0xf0 | ((cp >> 18) & 0x07)); result.push_back(0x80 | ((cp >> 12) & 0x3f)); result.push_back(0x80 | ((cp >> 6) & 0x3f)); result.push_back(0x80 | (cp & 0x3f)); + return result; } - else { - throw std::invalid_argument("invalid codepoint"); - } - return result; + + throw std::invalid_argument("invalid codepoint"); } std::vector unicode_cpts_normalize_nfd(const std::vector & cpts) { @@ -1660,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) { return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset)); } +bool unicode_cpt_is_whitespace(uint32_t cp) { + static const std::unordered_set is_whitespace = [] { + std::unordered_set is_whitespace; + for (auto p : unicode_ranges_whitespace) { + for (auto i = p.first; i <= p.second; ++i) { + is_whitespace.insert(i); + } + } + return is_whitespace; + }(); + return (bool)is_whitespace.count(cp); +} + std::string unicode_byte_to_utf8(uint8_t byte) { static std::unordered_map map = unicode_byte_to_utf8_map(); return map.at(byte); @@ -1670,3 +648,171 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) { return map.at(utf8); } +char32_t unicode_tolower(char32_t cp) { + auto it = unicode_map_lowercase.find(cp); + return it == unicode_map_lowercase.end() ? cp : it->second; +} + +std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs) { + // unicode categories + static const std::map k_ucat_enum = { + { "\\p{N}", CODEPOINT_TYPE_NUMBER }, + { "\\p{L}", CODEPOINT_TYPE_LETTER }, + { "\\p{P}", CODEPOINT_TYPE_PUNCTUATION }, + }; + + static const std::map k_ucat_cpt = { + { CODEPOINT_TYPE_NUMBER, 0xD1 }, + { CODEPOINT_TYPE_LETTER, 0xD2 }, + { CODEPOINT_TYPE_PUNCTUATION, 0xD3 }, + }; + + static const std::map k_ucat_map = { + { CODEPOINT_TYPE_NUMBER, "\x30-\x39" }, // 0-9 + { CODEPOINT_TYPE_LETTER, "\x41-\x5A\x61-\x7A" }, // A-Za-z + { CODEPOINT_TYPE_PUNCTUATION, "\x21-\x23\x25-\x2A\x2C-\x2F\x3A-\x3B\x3F-\x40\\\x5B-\\\x5D\x5F\\\x7B\\\x7D" }, // !-#%-*,-/:-;?-@\[-\]_\{\} + }; + + // compute collapsed codepoints only if needed by at least one regex + bool need_collapse = false; + for (auto & regex_expr : regex_exprs) { + // search for unicode categories + for (const auto & ucat : k_ucat_enum) { + if (std::string::npos != regex_expr.find(ucat.first)) { + need_collapse = true; + break; + } + } + } + + const auto cpts = unicode_cpts_from_utf8(text); + + // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte + // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935 + std::string text_collapsed; + if (need_collapse) { + // collapse all unicode categories + text_collapsed.resize(cpts.size()); + + for (size_t i = 0; i < cpts.size(); ++i) { + // keep single-byte codepoints as is + if (cpts[i] < 128) { + text_collapsed[i] = cpts[i]; + continue; + } + + const int cpt_type = unicode_cpt_type(cpts[i]); + + if (k_ucat_cpt.find(cpt_type) != k_ucat_cpt.end()) { + text_collapsed[i] = k_ucat_cpt.at(cpt_type); + } else { + text_collapsed[i] = (char) 0xD0; // fallback + } + } + } + + std::vector bpe_offsets = { cpts.size() }; + + for (auto & regex_expr : regex_exprs) { + // first, see if we have an efficient custom regex implementation + auto tmp = unicode_regex_split_custom(text, regex_expr, bpe_offsets); + + if (!tmp.empty()) { + bpe_offsets = std::move(tmp); + continue; + } + + // fallback to general-purpose std::regex / std::wregex + try { + // if a unicode category is used in the regex, we use the collapsed text and replace the unicode category + // with the corresponding collapsed representation + bool use_collapsed = false; + for (auto & ucat : k_ucat_enum) { + if (std::string::npos != regex_expr.find(ucat.first)) { + use_collapsed = true; + break; + } + } + + if (use_collapsed) { + // sanity-check that the original regex does not contain any non-ASCII characters + const auto cpts_regex = unicode_cpts_from_utf8(regex_expr); + for (size_t i = 0; i < cpts_regex.size(); ++i) { + if (cpts_regex[i] >= 128) { + throw std::runtime_error("Regex includes both unicode categories and non-ASCII characters - not supported"); + } + } + + // generate a collapsed representation of the regex + std::string regex_expr_collapsed; + + // track if we are inside [], because nested [] are not allowed + bool inside = false; + for (size_t i = 0; i < regex_expr.size(); ++i) { + if (regex_expr[i] == '[' && (i == 0 || regex_expr[i - 1] != '\\')) { + regex_expr_collapsed += '['; + inside = true; + continue; + } + + if (inside && regex_expr[i] == ']' && regex_expr[i - 1] != '\\') { + regex_expr_collapsed += ']'; + inside = false; + continue; + } + + if (regex_expr[i + 0] == '\\' && i + 4 < regex_expr.size() && + regex_expr[i + 1] == 'p' && + regex_expr[i + 2] == '{' && + regex_expr[i + 4] == '}') { + const std::string pat = regex_expr.substr(i, 5); + if (k_ucat_enum.find(pat) != k_ucat_enum.end()) { + if (!inside) { + regex_expr_collapsed += '['; + } + regex_expr_collapsed += k_ucat_cpt.at(k_ucat_enum.at(pat)); + regex_expr_collapsed += k_ucat_map.at(k_ucat_enum.at(pat)); + if (!inside) { + regex_expr_collapsed += ']'; + } + i += 4; + continue; + } + } + + regex_expr_collapsed += regex_expr[i]; + } + + //printf("text_collapsed: %s\n", text_collapsed.c_str()); + //printf("regex_expr_collapsed: %s\n", regex_expr_collapsed.c_str()); + bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets); + } else { + // no unicode category used, we can use std::wregex directly + const std::wstring wtext = unicode_wstring_from_utf8(text); + const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr); + + //printf("text: %s\n", text.c_str()); + //printf("regex_expr: %s\n", regex_expr.c_str()); + bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets); + } + } catch (std::regex_error & e) { + fprintf(stderr, "Failed to process regex: '%s'\n", regex_expr.c_str()); + fprintf(stderr, "Regex error: %s\n", e.what()); + throw std::runtime_error("Failed to process regex"); + } + } + + std::vector bpe_words; + bpe_words.reserve(bpe_offsets.size()); // reserve memory for the approximate size + + size_t start = 0; + for (size_t & offset : bpe_offsets) { + bpe_words.emplace_back(); + for (size_t i = start; i < start + offset; ++i) { + bpe_words.back() += unicode_cpt_to_utf8(cpts[i]); + } + start += offset; + } + + return unicode_byte_encoding_process(bpe_words); +} diff --git a/unicode.h b/unicode.h index 6d14a5a33..d6a14d470 100644 --- a/unicode.h +++ b/unicode.h @@ -5,9 +5,9 @@ #include #define CODEPOINT_TYPE_UNIDENTIFIED 0 -#define CODEPOINT_TYPE_DIGIT 1 +#define CODEPOINT_TYPE_NUMBER 1 #define CODEPOINT_TYPE_LETTER 2 -#define CODEPOINT_TYPE_WHITESPACE 3 +#define CODEPOINT_TYPE_SEPARATOR 3 #define CODEPOINT_TYPE_ACCENT_MARK 4 #define CODEPOINT_TYPE_PUNCTUATION 5 #define CODEPOINT_TYPE_SYMBOL 6 @@ -21,6 +21,11 @@ std::vector unicode_cpts_normalize_nfd(const std::vector & c int unicode_cpt_type(uint32_t cp); int unicode_cpt_type(const std::string & utf8); +bool unicode_cpt_is_whitespace(uint32_t cp); + std::string unicode_byte_to_utf8(uint8_t byte); uint8_t unicode_utf8_to_byte(const std::string & utf8); +char32_t unicode_tolower(char32_t cp); + +std::vector unicode_regex_split(const std::string & text, const std::vector & regex_exprs);