Compare commits: gg/flash-a...master
2645 commits
[Commit table omitted: the compare view listed 2645 commits by SHA1 only, from d7b31a9d84 through 1cfb5372cf; the Author and Date columns were empty in this capture.]
1192 changed files with 318276 additions and 200859 deletions
.clang-format (new file, 161 lines)
@ -0,0 +1,161 @@
---
Language: Cpp
AlignAfterOpenBracket: Align
AlignArrayOfStructures: Left
AlignConsecutiveAssignments: AcrossComments
AlignConsecutiveBitFields: AcrossComments
AlignConsecutiveDeclarations: AcrossComments
AlignConsecutiveMacros: AcrossComments
# AlignConsecutiveShortCaseStatements: AcrossComments
AlignEscapedNewlines: Left # LeftWithLastLine
AlignOperands: Align
AlignTrailingComments:
  Kind: Always
  OverEmptyLines: 1
AllowAllArgumentsOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: false
# AllowBreakBeforeNoexceptSpecifier: OnlyWithParen
AllowShortBlocksOnASingleLine: Never
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Inline
AllowShortIfStatementsOnASingleLine: Never
AllowShortLambdasOnASingleLine: Inline
AllowShortLoopsOnASingleLine: false
AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true # OnePerLine
BitFieldColonSpacing: Both
BreakBeforeBraces: Custom # Attach
BraceWrapping:
  AfterCaseLabel: true
  AfterClass: false
  AfterControlStatement: false
  AfterEnum: false
  AfterFunction: false
  AfterNamespace: false
  AfterObjCDeclaration: false
  AfterStruct: false
  AfterUnion: false
  AfterExternBlock: false
  BeforeCatch: false
  BeforeElse: false
  BeforeLambdaBody: false
  BeforeWhile: false
  IndentBraces: false
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
# BreakAdjacentStringLiterals: true
BreakAfterAttributes: Never
BreakBeforeBinaryOperators: None
BreakBeforeInlineASMColon: OnlyMultiline
BreakBeforeTernaryOperators: false
# BreakBinaryOperations: Never
BreakConstructorInitializers: AfterColon
# BreakFunctionDefinitionParameters: false
BreakInheritanceList: AfterComma
BreakStringLiterals: true
# BreakTemplateDeclarations: Yes
ColumnLimit: 120
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
DisableFormat: false
EmptyLineBeforeAccessModifier: Leave
EmptyLineAfterAccessModifier: Never
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
IncludeBlocks: Regroup
IncludeCategories:
  - Regex: '^<.*\.h>'
    Priority: 1
    SortPriority: 0
  - Regex: '^<.*'
    Priority: 2
    SortPriority: 0
  - Regex: '.*'
    Priority: 3
    SortPriority: 0
IncludeIsMainRegex: '([-_](test|unittest))?$'
IncludeIsMainSourceRegex: ''
IndentAccessModifiers: false
IndentCaseBlocks: true
IndentCaseLabels: true
IndentExternBlock: NoIndent
IndentGotoLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWrappedFunctionNames: false
InsertBraces: true # NOTE: may lead to incorrect formatting
InsertNewlineAtEOF: true
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
LambdaBodyIndentation: Signature
LineEnding: LF
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBinPackProtocolList: Auto
ObjCBlockIndentWidth: 4
ObjCSpaceAfterProperty: true
ObjCSpaceBeforeProtocolList: true
PPIndentWidth: -1
PackConstructorInitializers: CurrentLine
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyBreakTemplateDeclaration: 10
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Middle
QualifierAlignment: Left
#QualifierOrder: ['static', 'inline', 'friend', 'constexpr', 'const', 'volatile', 'type', 'restrict']
RawStringFormats:
  - Language: Cpp
    Delimiters:
      - cc
      - CC
      - cpp
      - Cpp
      - CPP
      - 'c++'
      - 'C++'
    CanonicalDelimiter: ''
ReferenceAlignment: Middle
ReflowComments: false # IndentOnly
SeparateDefinitionBlocks: Always
SortIncludes: CaseInsensitive
SortUsingDeclarations: LexicographicNumeric
SpaceAfterCStyleCast: true
SpaceAfterLogicalNot: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeCpp11BracedList: false
SpaceBeforeCtorInitializerColon: true
SpaceBeforeInheritanceColon: true
SpaceBeforeParens: ControlStatements
SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: Never
SpacesInContainerLiterals: true
SpacesInLineCommentPrefix:
  Minimum: 1
  Maximum: -1
SpacesInParentheses: false
SpacesInSquareBrackets: false
SpaceBeforeSquareBrackets: false
Standard: c++17
TabWidth: 4
UseTab: Never
WhitespaceSensitiveMacros: ['STRINGIZE']
...
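With this style file at the repository root, the style is typically applied with clang-format directly. A minimal sketch, assuming clang-format is installed; the source path is illustrative, not taken from this diff:

    # format one source file in place using the nearest .clang-format
    clang-format -i src/llama.cpp

    # report violations without rewriting the file (non-zero exit on any diff)
    clang-format --dry-run --Werror src/llama.cpp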
.clang-tidy
@ -12,12 +12,15 @@ Checks: >
     -readability-implicit-bool-conversion,
     -readability-magic-numbers,
     -readability-uppercase-literal-suffix,
+    -readability-simplify-boolean-expr,
     clang-analyzer-*,
     -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling,
     performance-*,
     portability-*,
+    -portability-simd-intrinsics,
     misc-*,
     -misc-const-correctness,
     -misc-non-private-member-variables-in-classes,
     -misc-no-recursion,
+    -misc-use-anonymous-namespace,
 FormatStyle: none
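The added entries extend the clang-tidy check list in the `Checks:` block above. A minimal sketch of running the checks locally, assuming a CMake build tree and an illustrative file path:

    # export a compile database, then run clang-tidy on one translation unit
    cmake -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
    clang-tidy -p build src/llama.cpp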
@ -15,7 +15,7 @@ node('x86_runner1'){ // Running on x86 runner containing latest vecto
     stage('Running llama.cpp'){
         sh'''#!/bin/bash
                 module load gnu-bin2/0.1 # loading latest versions of vector qemu and vector gcc
-                qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./main -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
+                qemu-riscv64 -L /softwares/gnu-bin2/sysroot -cpu rv64,v=true,vlen=256,elen=64,vext_spec=v1.0 ./llama-cli -m /home/alitariq/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9 > llama_log.txt # Running llama.cpp on vector qemu-riscv64
                 cat llama_log.txt # Printing results
         '''
     }
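The only change in this pipeline hunk is the binary rename from ./main to ./llama-cli; the flags are unchanged. The equivalent invocation outside the CI pipeline would be roughly as follows, with an illustrative model path:

    # run the renamed CLI binary directly (model path is illustrative)
    ./llama-cli -m models/codellama-7b.Q4_K_M.gguf -p "Anything" -n 9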
.devops/cpu.Dockerfile (new file, 92 lines)
@ -0,0 +1,92 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION AS build

ARG TARGETARCH

ARG GGML_CPU_ARM_ARCH=armv8-a

RUN apt-get update && \
    apt-get install -y build-essential git cmake libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "$TARGETARCH" = "amd64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        cmake -S . -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=${GGML_CPU_ARM_ARCH}; \
    else \
        echo "Unsupported architecture"; \
        exit 1; \
    fi && \
    cmake --build build -j $(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
|
.devops/cuda.Dockerfile (new file, 94 lines)
@@ -0,0 +1,94 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.6.0
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build

# CUDA architecture to build for (defaults to all supported archs)
ARG CUDA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y build-essential cmake python3 python3-pip git libcurl4-openssl-dev libgomp1

WORKDIR /app

COPY . .

RUN if [ "${CUDA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DCMAKE_CUDA_ARCHITECTURES=${CUDA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_CUDA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -1,34 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,45 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT ["/app/.devops/tools.sh"]

@@ -1,22 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential python3 python3-pip git

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

RUN make

ENV LC_ALL=C.utf8

ENTRYPOINT ["/app/.devops/tools.sh"]

.devops/intel.Dockerfile (new file, 91 lines)
@@ -0,0 +1,91 @@
ARG ONEAPI_VERSION=2025.0.0-0-devel-ubuntu22.04

## Build Image

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build

ARG GGML_SYCL_F16=OFF
RUN apt-get update && \
    apt-get install -y git libcurl4-openssl-dev

WORKDIR /app

COPY . .

RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
        echo "GGML_SYCL_F16 is set" \
        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
    fi && \
    echo "Building with dynamic libs" && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_CURL=ON ${OPT_SYCL_F16} && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

FROM intel/oneapi-basekit:$ONEAPI_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

### Full
FROM base AS full

COPY --from=build /app/lib/ /app
COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/lib/ /app
COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

.devops/llama-cli-cann.Dockerfile (new file, 44 lines)
@@ -0,0 +1,44 @@
ARG ASCEND_VERSION=8.0.rc2.alpha003-910b-openeuler22.03-py3.8

FROM ascendai/cann:$ASCEND_VERSION AS build

WORKDIR /app

COPY . .

RUN yum install -y gcc g++ cmake make
ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

# find libascend_hal.so, because the driver hasn't been mounted.
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/runtime/lib64/stub:$LD_LIBRARY_PATH

RUN echo "Building with static libs" && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh --force && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_CANN=ON -DBUILD_SHARED_LIBS=OFF && \
    cmake --build build --config Release --target llama-cli

# TODO: use image with NNRT
FROM ascendai/cann:$ASCEND_VERSION AS runtime
COPY --from=build /app/build/bin/llama-cli /llama-cli

ENV LC_ALL=C.utf8

ENV ASCEND_TOOLKIT_HOME=/usr/local/Ascend/ascend-toolkit/latest
ENV LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:$LIBRARY_PATH
ENV LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/lib64/plugin/opskernel:${ASCEND_TOOLKIT_HOME}/lib64/plugin/nnengine:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe/op_tiling:${LD_LIBRARY_PATH}
ENV PYTHONPATH=${ASCEND_TOOLKIT_HOME}/python/site-packages:${ASCEND_TOOLKIT_HOME}/opp/built-in/op_impl/ai_core/tbe:${PYTHONPATH}
ENV PATH=${ASCEND_TOOLKIT_HOME}/bin:${ASCEND_TOOLKIT_HOME}/compiler/ccec_compiler/bin:${PATH}
ENV ASCEND_AICPU_PATH=${ASCEND_TOOLKIT_HOME}
ENV ASCEND_OPP_PATH=${ASCEND_TOOLKIT_HOME}/opp
ENV TOOLCHAIN_HOME=${ASCEND_TOOLKIT_HOME}/toolkit
ENV ASCEND_HOME_PATH=${ASCEND_TOOLKIT_HOME}

ENTRYPOINT ["/llama-cli" ]

@@ -1,84 +0,0 @@ (file deleted)
# SRPM for building from source and packaging an RPM for RPM-based distros.
# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
# Built and maintained by John Boero - boeroboy@gmail.com
# In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

# Notes for llama.cpp:
# 1. Tags are currently based on hash - which will not sort asciibetically.
#    We need to declare standard versioning if people want to sort latest releases.
# 2. Builds for CUDA/OpenCL support are separate, with different depenedencies.
# 3. NVidia's developer repo must be enabled with nvcc, cublas, clblas, etc installed.
#    Example: https://developer.download.nvidia.com/compute/cuda/repos/fedora37/x86_64/cuda-fedora37.repo
# 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
#    It is up to the user to install the correct vendor-specific support.

Name: llama.cpp-clblast
Version: %( date "+%%Y%%m%%d" )
Release: 1%{?dist}
Summary: OpenCL Inference of LLaMA model in C/C++
License: MIT
Source0: https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
BuildRequires: coreutils make gcc-c++ git mesa-libOpenCL-devel clblast-devel
Requires: clblast
URL: https://github.com/ggerganov/llama.cpp

%define debug_package %{nil}
%define source_date_epoch_from_changelog 0

%description
CPU inference for Meta's Lllama2 models using default options.

%prep
%setup -n llama.cpp-master

%build
make -j LLAMA_CLBLAST=1

%install
mkdir -p %{buildroot}%{_bindir}/
cp -p main %{buildroot}%{_bindir}/llamaclblast
cp -p server %{buildroot}%{_bindir}/llamaclblastserver
cp -p simple %{buildroot}%{_bindir}/llamaclblastsimple

mkdir -p %{buildroot}/usr/lib/systemd/system
%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamaclblast.service
[Unit]
Description=Llama.cpp server, CPU only (no GPU support in this build).
After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

[Service]
Type=simple
EnvironmentFile=/etc/sysconfig/llama
ExecStart=/usr/bin/llamaclblastserver $LLAMA_ARGS
ExecReload=/bin/kill -s HUP $MAINPID
Restart=never

[Install]
WantedBy=default.target
EOF

mkdir -p %{buildroot}/etc/sysconfig
%{__cat} <<EOF > %{buildroot}/etc/sysconfig/llama
LLAMA_ARGS="-m /opt/llama2/ggml-model-f32.bin"
EOF

%clean
rm -rf %{buildroot}
rm -rf %{_builddir}/*

%files
%{_bindir}/llamaclblast
%{_bindir}/llamaclblastserver
%{_bindir}/llamaclblastsimple
/usr/lib/systemd/system/llamaclblast.service
%config /etc/sysconfig/llama

%pre

%post

%preun
%postun

%changelog

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -12,7 +12,7 @@
 # 4. OpenCL/CLBLAST support simply requires the ICD loader and basic opencl libraries.
 #    It is up to the user to install the correct vendor-specific support.

-Name: llama.cpp-cublas
+Name: llama.cpp-cuda
 Version: %( date "+%%Y%%m%%d" )
 Release: 1%{?dist}
 Summary: CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)

@@ -32,16 +32,16 @@ CPU inference for Meta's Lllama2 models using default options.
 %setup -n llama.cpp-master

 %build
-make -j LLAMA_CUBLAS=1
+make -j GGML_CUDA=1

 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llamacppcublas
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cuda-cli
-cp -p server %{buildroot}%{_bindir}/llamacppcublasserver
+cp -p llama-server %{buildroot}%{_bindir}/llama-cuda-server
-cp -p simple %{buildroot}%{_bindir}/llamacppcublassimple
+cp -p llama-simple %{buildroot}%{_bindir}/llama-cuda-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
-%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacublas.service
+%{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llamacuda.service
 [Unit]
 Description=Llama.cpp server, CPU only (no GPU support in this build).
 After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target

@@ -49,7 +49,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamacppcublasserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-cuda-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -67,10 +67,10 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llamacppcublas
+%{_bindir}/llama-cuda-cli
-%{_bindir}/llamacppcublasserver
+%{_bindir}/llama-cuda-server
-%{_bindir}/llamacppcublassimple
+%{_bindir}/llama-cuda-simple
-/usr/lib/systemd/system/llamacublas.service
+/usr/lib/systemd/system/llamacuda.service
 %config /etc/sysconfig/llama

 %pre

@@ -1,5 +1,5 @@
 # SRPM for building from source and packaging an RPM for RPM-based distros.
-# https://fedoraproject.org/wiki/How_to_create_an_RPM_package
+# https://docs.fedoraproject.org/en-US/quick-docs/creating-rpm-packages
 # Built and maintained by John Boero - boeroboy@gmail.com
 # In honor of Seth Vidal https://www.redhat.com/it/blog/thank-you-seth-vidal

@@ -38,9 +38,9 @@ make -j
 %install
 mkdir -p %{buildroot}%{_bindir}/
-cp -p main %{buildroot}%{_bindir}/llama
+cp -p llama-cli %{buildroot}%{_bindir}/llama-cli
-cp -p server %{buildroot}%{_bindir}/llamaserver
+cp -p llama-server %{buildroot}%{_bindir}/llama-server
-cp -p simple %{buildroot}%{_bindir}/llamasimple
+cp -p llama-simple %{buildroot}%{_bindir}/llama-simple

 mkdir -p %{buildroot}/usr/lib/systemd/system
 %{__cat} <<EOF > %{buildroot}/usr/lib/systemd/system/llama.service

@@ -51,7 +51,7 @@ After=syslog.target network.target local-fs.target remote-fs.target nss-lookup.target
 [Service]
 Type=simple
 EnvironmentFile=/etc/sysconfig/llama
-ExecStart=/usr/bin/llamaserver $LLAMA_ARGS
+ExecStart=/usr/bin/llama-server $LLAMA_ARGS
 ExecReload=/bin/kill -s HUP $MAINPID
 Restart=never

@@ -69,9 +69,9 @@ rm -rf %{buildroot}
 rm -rf %{_builddir}/*

 %files
-%{_bindir}/llama
+%{_bindir}/llama-cli
-%{_bindir}/llamaserver
+%{_bindir}/llama-server
-%{_bindir}/llamasimple
+%{_bindir}/llama-simple
 /usr/lib/systemd/system/llama.service
 %config /etc/sysconfig/llama

@@ -1,32 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/main /main

ENTRYPOINT [ "/main" ]

@@ -1,26 +0,0 @@ (file deleted)
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04

FROM intel/hpckit:$ONEAPI_VERSION as build

RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
    cd build && \
    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
    cmake --build . --config Release --target main server

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/build/bin/main /main
COPY --from=build /app/build/bin/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]

@@ -1,45 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/main" ]

@@ -1,20 +0,0 @@ (file deleted)
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/main /main

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/main" ]

.devops/musa.Dockerfile (new file, 108 lines)
@@ -0,0 +1,108 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG MUSA_VERSION=rc3.1.0
# Target the MUSA build image
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_MUSA_DEV_CONTAINER} AS build

# MUSA architecture to build for (defaults to all supported archs)
ARG MUSA_DOCKER_ARCH=default

RUN apt-get update && \
    apt-get install -y \
        build-essential \
        cmake \
        python3 \
        python3-pip \
        git \
        libcurl4-openssl-dev \
        libgomp1

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Use the default MUSA archs if not specified
RUN if [ "${MUSA_DOCKER_ARCH}" != "default" ]; then \
        export CMAKE_ARGS="-DMUSA_ARCHITECTURES=${MUSA_DOCKER_ARCH}"; \
    fi && \
    cmake -B build -DGGML_NATIVE=OFF -DGGML_MUSA=ON -DLLAMA_CURL=ON ${CMAKE_ARGS} -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined . && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_MUSA_RUN_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    && pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]

@@ -6,11 +6,10 @@
 let
   inherit (config.packages) default;
   binaries = [
-    "llama"
+    "llama-cli"
     "llama-embedding"
     "llama-server"
-    "quantize"
+    "llama-quantize"
-    "train-text-from-scratch"
   ];
   mkApp = name: {
     type = "app";

@@ -1,13 +1,52 @@
+{ inputs, ... }:
+
 {
   perSystem =
-    { config, lib, ... }:
+    {
+      config,
+      lib,
+      system,
+      ...
+    }:
     {
       devShells =
-        lib.concatMapAttrs
-          (name: package: {
-            ${name} = package.passthru.shell;
-            ${name + "-extra"} = package.passthru.shell-extra;
-          })
-          config.packages;
+        let
+          pkgs = import inputs.nixpkgs { inherit system; };
+          stdenv = pkgs.stdenv;
+          scripts = config.packages.python-scripts;
+        in
+        lib.pipe (config.packages) [
+          (lib.concatMapAttrs (
+            name: package: {
+              ${name} = pkgs.mkShell {
+                name = "${name}";
+                inputsFrom = [ package ];
+                shellHook = ''
+                  echo "Entering ${name} devShell"
+                '';
+              };
+              "${name}-extra" =
+                if (name == "python-scripts") then
+                  null
+                else
+                  pkgs.mkShell {
+                    name = "${name}-extra";
+                    inputsFrom = [
+                      package
+                      scripts
+                    ];
+                    # Extra packages that *may* be used by some scripts
+                    packages = [
+                      pkgs.python3Packages.tiktoken
+                    ];
+                    shellHook = ''
+                      echo "Entering ${name} devShell"
+                      addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib stdenv.cc.cc}/lib"
+                    '';
+                  };
+            }
+          ))
+          (lib.filterAttrs (name: value: value != null))
+        ];
     };
 }

.devops/nix/docker.nix (new file, 37 lines)
@@ -0,0 +1,37 @@
{
  lib,
  dockerTools,
  buildEnv,
  llama-cpp,
  interactive ? true,
  coreutils,
}:

# A tar that can be fed into `docker load`:
#
# $ nix build .#llamaPackages.docker
# $ docker load < result

# For details and variations cf.
# - https://nixos.org/manual/nixpkgs/unstable/#ssec-pkgs-dockerTools-buildLayeredImage
# - https://discourse.nixos.org/t/a-faster-dockertools-buildimage-prototype/16922
# - https://nixery.dev/

# Approximate (compressed) sizes, at the time of writing, are:
#
# .#llamaPackages.docker: 125M;
# .#llamaPackagesCuda.docker: 537M;
# .#legacyPackages.aarch64-linux.llamaPackagesXavier.docker: 415M.

dockerTools.buildLayeredImage {
  name = llama-cpp.pname;
  tag = "latest";

  contents =
    [ llama-cpp ]
    ++ lib.optionals interactive [
      coreutils
      dockerTools.binSh
      dockerTools.caCertificates
    ];
}

@@ -26,16 +26,14 @@
       config.cudaSupport = true;
       config.allowUnfreePredicate =
         p:
-        builtins.all
-          (
-            license:
-            license.free
-            || builtins.elem license.shortName [
-              "CUDA EULA"
-              "cuDNN EULA"
-            ]
-          )
-          (p.meta.licenses or [ p.meta.license ]);
+        builtins.all (
+          license:
+          license.free
+          || builtins.elem license.shortName [
+            "CUDA EULA"
+            "cuDNN EULA"
+          ]
+        ) (p.meta.licenses or [ p.meta.license ]);
     };
     # Ensure dependencies use ROCm consistently
     pkgsRocm = import inputs.nixpkgs {

.devops/nix/package-gguf-py.nix (new file, 36 lines)
@@ -0,0 +1,36 @@
{
  lib,
  llamaVersion,
  numpy,
  tqdm,
  sentencepiece,
  pyyaml,
  poetry-core,
  buildPythonPackage,
  pytestCheckHook,
}:

buildPythonPackage {
  pname = "gguf";
  version = llamaVersion;
  pyproject = true;
  nativeBuildInputs = [ poetry-core ];
  propagatedBuildInputs = [
    numpy
    tqdm
    sentencepiece
    pyyaml
  ];
  src = lib.cleanSource ../../gguf-py;
  pythonImportsCheck = [
    "numpy"
    "gguf"
  ];
  nativeCheckInputs = [ pytestCheckHook ];
  doCheck = true;
  meta = with lib; {
    description = "Python package for writing binary files in the GGUF format";
    license = licenses.mit;
    maintainers = [ maintainers.ditsuke ];
  };
}

@@ -1,32 +1,47 @@
 {
   lib,
+  glibc,
   config,
   stdenv,
-  mkShell,
+  runCommand,
   cmake,
   ninja,
   pkg-config,
   git,
-  python3,
   mpi,
-  openblas, # TODO: Use the generic `blas` so users could switch between alternative implementations
+  blas,
   cudaPackages,
+  autoAddDriverRunpath,
   darwin,
   rocmPackages,
-  clblast,
-  useBlas ? builtins.all (x: !x) [
-    useCuda
-    useMetalKit
-    useOpenCL
-    useRocm
-  ],
+  vulkan-headers,
+  vulkan-loader,
+  curl,
+  shaderc,
+  useBlas ?
+    builtins.all (x: !x) [
+      useCuda
+      useMetalKit
+      useRocm
+      useVulkan
+    ]
+    && blas.meta.available,
   useCuda ? config.cudaSupport,
-  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin && !useOpenCL,
-  useMpi ? false, # Increases the runtime closure size by ~700M
-  useOpenCL ? false,
+  useMetalKit ? stdenv.isAarch64 && stdenv.isDarwin,
+  # Increases the runtime closure size by ~700M
+  useMpi ? false,
   useRocm ? config.rocmSupport,
+  rocmGpuTargets ? builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets,
+  enableCurl ? true,
+  useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
-}@inputs:
+
+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
+  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic,
+  precompileMetalShaders ? false,
+}:

 let
   inherit (lib)
@@ -34,50 +49,29 @@ let
     cmakeFeature
     optionals
     strings
-    versionOlder
     ;

-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
   stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;

   suffices =
     lib.optionals useBlas [ "BLAS" ]
     ++ lib.optionals useCuda [ "CUDA" ]
     ++ lib.optionals useMetalKit [ "MetalKit" ]
     ++ lib.optionals useMpi [ "MPI" ]
-    ++ lib.optionals useOpenCL [ "OpenCL" ]
-    ++ lib.optionals useRocm [ "ROCm" ];
+    ++ lib.optionals useRocm [ "ROCm" ]
+    ++ lib.optionals useVulkan [ "Vulkan" ];

   pnameSuffix =
     strings.optionalString (suffices != [ ])
       "-${strings.concatMapStringsSep "-" strings.toLower suffices}";
-  descriptionSuffix =
-    strings.optionalString (suffices != [ ])
-      ", accelerated with ${strings.concatStringsSep ", " suffices}";
+  descriptionSuffix = strings.optionalString (
+    suffices != [ ]
+  ) ", accelerated with ${strings.concatStringsSep ", " suffices}";

-  # TODO: package the Python in this repository in a Nix-like way.
-  # It'd be nice to migrate to buildPythonPackage, as well as ensure this repo
-  # is PEP 517-compatible, and ensure the correct .dist-info is generated.
-  # https://peps.python.org/pep-0517/
-  llama-python = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-    ]
-  );
-
-  # TODO(Green-Sky): find a better way to opt-into the heavy ml python runtime
-  llama-python-extra = python3.withPackages (
-    ps: [
-      ps.numpy
-      ps.sentencepiece
-      ps.tiktoken
-      ps.torchWithoutCuda
-      ps.transformers
-    ]
-  );
+  xcrunHost = runCommand "xcrunHost" { } ''
+    mkdir -p $out/bin
+    ln -s /usr/bin/xcrun $out/bin
+  '';

   # apple_sdk is supposed to choose sane defaults, no need to handle isAarch64
   # separately
@@ -91,16 +85,9 @@ let
     ++ optionals useMetalKit [ MetalKit ];

   cudaBuildInputs = with cudaPackages; [
-    cuda_cccl.dev # <nv/target>
-
-    # A temporary hack for reducing the closure size, remove once cudaPackages
-    # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-    cuda_cudart.dev
-    cuda_cudart.lib
-    cuda_cudart.static
-    libcublas.dev
-    libcublas.lib
-    libcublas.static
+    cuda_cudart
+    cuda_cccl # <nv/target>
+    libcublas
   ];

   rocmBuildInputs = with rocmPackages; [
@@ -108,170 +95,153 @@ let
     hipblas
     rocblas
   ];

+  vulkanBuildInputs = [
+    vulkan-headers
+    vulkan-loader
+    shaderc
+  ];
 in

-effectiveStdenv.mkDerivation (
-  finalAttrs: {
-    pname = "llama-cpp${pnameSuffix}";
-    version = llamaVersion;
+effectiveStdenv.mkDerivation (finalAttrs: {
+  pname = "llama-cpp${pnameSuffix}";
+  version = llamaVersion;

   # Note: none of the files discarded here are visible in the sandbox or
   # affect the output hash. This also means they can be modified without
   # triggering a rebuild.
   src = lib.cleanSourceWith {
     filter =
       name: type:
       let
         noneOf = builtins.all (x: !x);
         baseName = baseNameOf name;
       in
       noneOf [
         (lib.hasSuffix ".nix" name) # Ignore *.nix files when computing outPaths
         (lib.hasSuffix ".md" name) # Ignore *.md changes whe computing outPaths
         (lib.hasPrefix "." baseName) # Skip hidden files and directories
         (baseName == "flake.lock")
       ];
     src = lib.cleanSource ../../.;
   };

-  postPatch = ''
-    substituteInPlace ./ggml-metal.m \
-      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
-
-    # TODO: Package up each Python script or service appropriately.
-    # If we were to migrate to buildPythonPackage and prepare the `pyproject.toml`,
-    # we could make those *.py into setuptools' entrypoints
-    substituteInPlace ./*.py --replace "/usr/bin/env python" "${llama-python}/bin/python"
-  '';
-
-  nativeBuildInputs =
-    [
-      cmake
-      ninja
-      pkg-config
-      git
-    ]
-    ++ optionals useCuda [
-      cudaPackages.cuda_nvcc
-
-      # TODO: Replace with autoAddDriverRunpath
-      # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
-      cudaPackages.autoAddOpenGLRunpathHook
-    ];
-
-  buildInputs =
-    optionals effectiveStdenv.isDarwin darwinBuildInputs
-    ++ optionals useCuda cudaBuildInputs
-    ++ optionals useMpi [ mpi ]
-    ++ optionals useOpenCL [ clblast ]
-    ++ optionals useRocm rocmBuildInputs;
-
-  cmakeFlags =
-    [
-      (cmakeBool "LLAMA_NATIVE" false)
-      (cmakeBool "LLAMA_BUILD_SERVER" true)
-      (cmakeBool "BUILD_SHARED_LIBS" true)
-      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
-      (cmakeBool "LLAMA_BLAS" useBlas)
-      (cmakeBool "LLAMA_CLBLAST" useOpenCL)
-      (cmakeBool "LLAMA_CUBLAS" useCuda)
-      (cmakeBool "LLAMA_HIPBLAS" useRocm)
-      (cmakeBool "LLAMA_METAL" useMetalKit)
-      (cmakeBool "LLAMA_MPI" useMpi)
-    ]
-    ++ optionals useCuda [
-      (
-        with cudaPackages.flags;
-        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
-          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
-        )
-      )
-    ]
-    ++ optionals useRocm [
-      (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
-      (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
-
-      # Build all targets supported by rocBLAS. When updating search for TARGET_LIST_ROCM
-      # in https://github.com/ROCmSoftwarePlatform/rocBLAS/blob/develop/CMakeLists.txt
-      # and select the line that matches the current nixpkgs version of rocBLAS.
-      # Should likely use `rocmPackages.clr.gpuTargets`.
-      "-DAMDGPU_TARGETS=gfx803;gfx900;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack+;gfx90a:xnack-;gfx940;gfx941;gfx942;gfx1010;gfx1012;gfx1030;gfx1100;gfx1101;gfx1102"
-    ]
-    ++ optionals useMetalKit [ (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1") ]
-    ++ optionals useBlas [ (lib.cmakeFeature "LLAMA_BLAS_VENDOR" "OpenBLAS") ];
+  postPatch = ''
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";"
+    substituteInPlace ./ggml/src/ggml-metal/ggml-metal.m \
+      --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
+  '';
+
+  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # `default.metallib` may be compiled with Metal compiler from XCode
+  # and we need to escape sandbox on MacOS to access Metal compiler.
+  # `xcrun` is used find the path of the Metal compiler, which is varible
+  # and not on $PATH
+  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
+
+  nativeBuildInputs =
+    [
+      cmake
+      ninja
+      pkg-config
+      git
+    ]
+    ++ optionals useCuda [
+      cudaPackages.cuda_nvcc
+
+      autoAddDriverRunpath
+    ]
+    ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [ glibc.static ]
+    ++ optionals (effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders) [ xcrunHost ];
+
+  buildInputs =
+    optionals effectiveStdenv.isDarwin darwinBuildInputs
+    ++ optionals useCuda cudaBuildInputs
+    ++ optionals useMpi [ mpi ]
+    ++ optionals useRocm rocmBuildInputs
+    ++ optionals useBlas [ blas ]
+    ++ optionals useVulkan vulkanBuildInputs
+    ++ optionals enableCurl [ curl ];
+
+  cmakeFlags =
+    [
+      (cmakeBool "LLAMA_BUILD_SERVER" true)
+      (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
+      (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
+      (cmakeBool "LLAMA_CURL" enableCurl)
+      (cmakeBool "GGML_NATIVE" false)
+      (cmakeBool "GGML_BLAS" useBlas)
+      (cmakeBool "GGML_CUDA" useCuda)
+      (cmakeBool "GGML_HIP" useRocm)
+      (cmakeBool "GGML_METAL" useMetalKit)
+      (cmakeBool "GGML_VULKAN" useVulkan)
+      (cmakeBool "GGML_STATIC" enableStatic)
+    ]
+    ++ optionals useCuda [
+      (
+        with cudaPackages.flags;
+        cmakeFeature "CMAKE_CUDA_ARCHITECTURES" (
+          builtins.concatStringsSep ";" (map dropDot cudaCapabilities)
+        )
+      )
+    ]
+    ++ optionals useRocm [
+      (cmakeFeature "CMAKE_HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+      (cmakeFeature "CMAKE_HIP_ARCHITECTURES" rocmGpuTargets)
+    ]
+    ++ optionals useMetalKit [
+      (lib.cmakeFeature "CMAKE_C_FLAGS" "-D__ARM_FEATURE_DOTPROD=1")
+      (cmakeBool "GGML_METAL_EMBED_LIBRARY" (!precompileMetalShaders))
+    ];
+
+  # Environment variables needed for ROCm
+  env = optionals useRocm {
+    ROCM_PATH = "${rocmPackages.clr}";
+    HIP_DEVICE_LIB_PATH = "${rocmPackages.rocm-device-libs}/amdgcn/bitcode";
+  };

   # TODO(SomeoneSerge): It's better to add proper install targets at the CMake level,
   # if they haven't been added yet.
   postInstall = ''
-    mv $out/bin/main $out/bin/llama
-    mv $out/bin/server $out/bin/llama-server
-    mkdir -p $out/include
-    cp $src/llama.h $out/include/
+    mkdir -p $out/include
+    cp $src/include/llama.h $out/include/
   '';

-  # Define the shells here, but don't add in the inputsFrom to avoid recursion.
-  passthru = {
-    inherit
-      useBlas
-      useCuda
-      useMetalKit
-      useMpi
-      useOpenCL
-      useRocm
-      ;
-
-    shell = mkShell {
-      name = "shell-${finalAttrs.finalPackage.name}";
-      description = "contains numpy and sentencepiece";
-      buildInputs = [ llama-python ];
-      inputsFrom = [ finalAttrs.finalPackage ];
-      shellHook = ''
-        addToSearchPath "LD_LIBRARY_PATH" "${lib.getLib effectiveStdenv.cc.cc}/lib"
-      '';
-    };
-
-    shell-extra = mkShell {
-      name = "shell-extra-${finalAttrs.finalPackage.name}";
-      description = "contains numpy, sentencepiece, torchWithoutCuda, and transformers";
-      buildInputs = [ llama-python-extra ];
-      inputsFrom = [ finalAttrs.finalPackage ];
-    };
-  };
-
-  meta = {
-    # Configurations we don't want even the CI to evaluate. Results in the
-    # "unsupported platform" messages. This is mostly a no-op, because
-    # cudaPackages would've refused to evaluate anyway.
-    badPlatforms = optionals (useCuda || useOpenCL) lib.platforms.darwin;
-
-    # Configurations that are known to result in build failures. Can be
-    # overridden by importing Nixpkgs with `allowBroken = true`.
-    broken = (useMetalKit && !effectiveStdenv.isDarwin);
-
-    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggerganov/llama.cpp/";
-    license = lib.licenses.mit;
-
-    # Accommodates `nix run` and `lib.getExe`
-    mainProgram = "llama";
-
-    # These people might respond, on the best effort basis, if you ping them
-    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
-    # Consider adding yourself to this list if you want to ensure this flake
-    # stays maintained and you're willing to invest your time. Do not add
-    # other people without their consent. Consider removing people after
-    # they've been unreachable for long periods of time.
-
-    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
-    # an attrset following the same format as in
-    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
-    maintainers = with lib.maintainers; [
-      philiptaron
-      SomeoneSerge
-    ];
-
-    # Extend `badPlatforms` instead
-    platforms = lib.platforms.all;
-  };
-  }
-)
+  meta = {
+    # Configurations we don't want even the CI to evaluate. Results in the
+    # "unsupported platform" messages. This is mostly a no-op, because
+    # cudaPackages would've refused to evaluate anyway.
+    badPlatforms = optionals useCuda lib.platforms.darwin;
+
+    # Configurations that are known to result in build failures. Can be
+    # overridden by importing Nixpkgs with `allowBroken = true`.
+    broken = (useMetalKit && !effectiveStdenv.isDarwin);
+
+    description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
+    homepage = "https://github.com/ggerganov/llama.cpp/";
+    license = lib.licenses.mit;
+
+    # Accommodates `nix run` and `lib.getExe`
+    mainProgram = "llama-cli";
+
+    # These people might respond, on the best effort basis, if you ping them
+    # in case of Nix-specific regressions or for reviewing Nix-specific PRs.
+    # Consider adding yourself to this list if you want to ensure this flake
+    # stays maintained and you're willing to invest your time. Do not add
+    # other people without their consent. Consider removing people after
+    # they've been unreachable for long periods of time.
+
+    # Note that lib.maintainers is defined in Nixpkgs, but you may just add
+    # an attrset following the same format as in
+    # https://github.com/NixOS/nixpkgs/blob/f36a80e54da29775c78d7eff0e628c2b4e34d1d7/maintainers/maintainer-list.nix
+    maintainers = with lib.maintainers; [
+      philiptaron
+      SomeoneSerge
+    ];
+
+    # Extend `badPlatforms` instead
+    platforms = lib.platforms.all;
+  };
+})

.devops/nix/python-scripts.nix (new file, 66 lines)
@@ -0,0 +1,66 @@
{
  lib,
  stdenv,
  buildPythonPackage,
  poetry-core,
  mkShell,
  python3Packages,
  gguf-py,
}@inputs:

let
  llama-python-deps = with python3Packages; [
    numpy
    sentencepiece
    transformers
    protobuf
    torchWithoutCuda
    gguf-py
    tqdm

    # for scripts/compare-llama-bench.py
    gitpython
    tabulate

    # for examples/pydantic-models-to-grammar-examples.py
    docstring-parser
    pydantic
  ];

  llama-python-test-deps = with python3Packages; [
    # Server bench
    matplotlib

    # server tests
    openai
    pytest
    prometheus-client
  ];
in

buildPythonPackage ({
  pname = "llama-scripts";
  version = "0.0.0";
  pyproject = true;

  # NOTE: The files filtered out here are not visible in the build sandbox, neither
  # do they affect the output hash. They can be modified without triggering a rebuild.
  src = lib.cleanSourceWith {
    filter =
      name: type:
      let
        any = builtins.any (x: x);
        baseName = builtins.baseNameOf name;
      in
      any [
        (lib.hasSuffix ".py" name)
        (baseName == "README.md")
        (baseName == "pyproject.toml")
      ];
    src = lib.cleanSource ../../.;
  };
  nativeBuildInputs = [ poetry-core ];
  nativeCheckInputs = llama-python-test-deps;
  dependencies = llama-python-deps;
})

@@ -1,16 +1,41 @@
 {
   lib,
   newScope,
+  python3,
   llamaVersion ? "0.0.0",
 }:
 
+let
+  pythonPackages = python3.pkgs;
+  buildPythonPackage = pythonPackages.buildPythonPackage;
+  numpy = pythonPackages.numpy;
+  tqdm = pythonPackages.tqdm;
+  sentencepiece = pythonPackages.sentencepiece;
+  pyyaml = pythonPackages.pyyaml;
+  poetry-core = pythonPackages.poetry-core;
+  pytestCheckHook = pythonPackages.pytestCheckHook;
+in
+
 # We're using `makeScope` instead of just writing out an attrset
 # because it allows users to apply overlays later using `overrideScope'`.
 # Cf. https://noogle.dev/f/lib/makeScope
 
-lib.makeScope newScope (
-  self: {
-    inherit llamaVersion;
-    llama-cpp = self.callPackage ./package.nix { };
-  }
-)
+lib.makeScope newScope (self: {
+  inherit llamaVersion;
+  gguf-py = self.callPackage ./package-gguf-py.nix {
+    inherit
+      buildPythonPackage
+      numpy
+      tqdm
+      sentencepiece
+      poetry-core
+      pyyaml
+      pytestCheckHook
+      ;
+  };
+  python-scripts = self.callPackage ./python-scripts.nix { inherit buildPythonPackage poetry-core; };
+  llama-cpp = self.callPackage ./package.nix { };
+  docker = self.callPackage ./docker.nix { };
+  docker-min = self.callPackage ./docker.nix { interactive = false; };
+  sif = self.callPackage ./sif.nix { };
+})
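The `makeScope` comment above is the point of this structure: a consumer can swap out or tweak any package in the scope without forking the expression. A minimal sketch of what that looks like, assuming the scope is exposed to the consumer under the name `llamaPackages` (that attribute name is an assumption for illustration, not something this diff pins down):

    # Hypothetical consumer-side override; `llamaPackages` stands in for
    # however the flake exposes the scope built by the file above.
    llamaPackages.overrideScope' (final: prev: {
      # Rebuild llama-cpp from the same scope, but from a local checkout.
      llama-cpp = prev.llama-cpp.overrideAttrs (old: {
        src = /path/to/local/llama.cpp;
      });
    })

Because `docker`, `docker-min` and `sif` are built from `final.llama-cpp` inside the same scope, they pick up such an override automatically.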
.devops/nix/sif.nix (new file, 27 lines)
@@ -0,0 +1,27 @@
{
  lib,
  singularity-tools,
  llama-cpp,
  bashInteractive,
  interactive ? false,
}:

let
  optionalInt = cond: x: if cond then x else 0;
in
singularity-tools.buildImage rec {
  inherit (llama-cpp) name;
  contents = [ llama-cpp ] ++ lib.optionals interactive [ bashInteractive ];

  # These are excessive (but safe) for most variants. Building singularity
  # images requires superuser privileges, so we build them inside a VM in a
  # writable image of pre-determined size.
  #
  # ROCm is currently affected by https://github.com/NixOS/nixpkgs/issues/276846
  #
  # Expected image sizes:
  # - cpu/blas: 150M,
  # - cuda, all gencodes: 560M,
  diskSize = 4096 + optionalInt llama-cpp.useRocm 16384;
  memSize = diskSize;
}
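Since `singularity-tools.buildImage` does the privileged work inside a VM, the only user-facing step is the Nix build itself. A hedged usage sketch; the `.#llamaPackages.sif` attribute path and the way the resulting image is invoked are assumptions, not something this diff specifies:

    # Attribute path assumed; adjust to however the flake exposes the scope.
    nix build .#llamaPackages.sif
    # ./result points at the image produced by singularity-tools.buildImage;
    # the exact run script and arguments depend on the derivation.
    apptainer run ./result --help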
.devops/rocm.Dockerfile (new file, 113 lines)
@@ -0,0 +1,113 @@
ARG UBUNTU_VERSION=24.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=6.3
ARG AMDGPU_VERSION=6.3

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

### Build image
FROM ${BASE_ROCM_DEV_CONTAINER} AS build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
# gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
# gfx906 is deprecated
#check https://rocm.docs.amd.com/projects/install-on-linux/en/docs-6.2.4/reference/system-requirements.html

#ARG ROCM_DOCKER_ARCH='gfx803,gfx900,gfx906,gfx908,gfx90a,gfx942,gfx1010,gfx1030,gfx1032,gfx1100,gfx1101,gfx1102'
ARG ROCM_DOCKER_ARCH=gfx1100

# Set nvcc architectured
ENV AMDGPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
# ENV CC=/opt/rocm/llvm/bin/clang
# ENV CXX=/opt/rocm/llvm/bin/clang++

RUN apt-get update \
    && apt-get install -y \
    build-essential \
    cmake \
    git \
    libcurl4-openssl-dev \
    curl \
    libgomp1

WORKDIR /app

COPY . .

RUN HIPCXX="$(hipconfig -l)/clang" HIP_PATH="$(hipconfig -R)" \
    cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=$ROCM_DOCKER_ARCH -DCMAKE_BUILD_TYPE=Release -DLLAMA_CURL=ON \
    && cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib \
    && find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ${BASE_ROCM_DEV_CONTAINER} AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl\
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3-pip \
    python3 \
    python3-wheel\
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
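The new Dockerfile is multi-target (`full`, `light`, `server`), so the image flavour is picked at build time with `--target`, and the architecture list can be widened from the `gfx1100` default through the `ROCM_DOCKER_ARCH` build argument. A usage sketch; the image tag is a placeholder and the architecture list is only an example:

    # Server-only ROCm image for two example architectures (tag is a placeholder)
    docker build -f .devops/rocm.Dockerfile \
      --build-arg ROCM_DOCKER_ARCH='gfx1030,gfx1100' \
      --target server -t llama-cpp-rocm-server .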
(deleted file)
@@ -1,32 +0,0 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV LLAMA_CUBLAS=1

RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} as runtime

COPY --from=build /app/server /server

ENTRYPOINT [ "/server" ]
(deleted file)
@@ -1,25 +0,0 @@
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
ARG UBUNTU_VERSION=22.04

FROM intel/hpckit:$ONEAPI_VERSION as build

RUN apt-get update && \
    apt-get install -y git

WORKDIR /app

COPY . .

# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
    cd build && \
    cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
    cmake --build . --config Release --target main server

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/build/bin/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
(deleted file)
@@ -1,45 +0,0 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG ROCM_VERSION=5.6

# Target the CUDA build image
ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-complete

FROM ${BASE_ROCM_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
# This is mostly tied to rocBLAS supported archs.
ARG ROCM_DOCKER_ARCH=\
    gfx803 \
    gfx900 \
    gfx906 \
    gfx908 \
    gfx90a \
    gfx1010 \
    gfx1030 \
    gfx1100 \
    gfx1101 \
    gfx1102

COPY requirements.txt requirements.txt
COPY requirements requirements

RUN pip install --upgrade pip setuptools wheel \
    && pip install -r requirements.txt

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV GPU_TARGETS=${ROCM_DOCKER_ARCH}
# Enable ROCm
ENV LLAMA_HIPBLAS=1
ENV CC=/opt/rocm/llvm/bin/clang
ENV CXX=/opt/rocm/llvm/bin/clang++

RUN make

ENTRYPOINT [ "/app/server" ]
(deleted file)
@@ -1,20 +0,0 @@
ARG UBUNTU_VERSION=22.04

FROM ubuntu:$UBUNTU_VERSION as build

RUN apt-get update && \
    apt-get install -y build-essential git

WORKDIR /app

COPY . .

RUN make

FROM ubuntu:$UBUNTU_VERSION as runtime

COPY --from=build /app/server /server

ENV LC_ALL=C.utf8

ENTRYPOINT [ "/server" ]
@@ -8,36 +8,40 @@ arg1="$1"
 shift
 
 if [[ "$arg1" == '--convert' || "$arg1" == '-c' ]]; then
-    python3 ./convert.py "$@"
+    exec python3 ./convert_hf_to_gguf.py "$@"
 elif [[ "$arg1" == '--quantize' || "$arg1" == '-q' ]]; then
-    ./quantize "$@"
+    exec ./llama-quantize "$@"
 elif [[ "$arg1" == '--run' || "$arg1" == '-r' ]]; then
-    ./main "$@"
+    exec ./llama-cli "$@"
-elif [[ "$arg1" == '--finetune' || "$arg1" == '-f' ]]; then
-    ./finetune "$@"
+elif [[ "$arg1" == '--bench' || "$arg1" == '-b' ]]; then
+    exec ./llama-bench "$@"
+elif [[ "$arg1" == '--perplexity' || "$arg1" == '-p' ]]; then
+    exec ./llama-perplexity "$@"
 elif [[ "$arg1" == '--all-in-one' || "$arg1" == '-a' ]]; then
     echo "Converting PTH to GGML..."
-    for i in `ls $1/$2/ggml-model-f16.bin*`; do
+    for i in $(ls $1/$2/ggml-model-f16.bin*); do
         if [ -f "${i/f16/q4_0}" ]; then
             echo "Skip model quantization, it already exists: ${i/f16/q4_0}"
         else
             echo "Converting PTH to GGML: $i into ${i/f16/q4_0}..."
-            ./quantize "$i" "${i/f16/q4_0}" q4_0
+            exec ./llama-quantize "$i" "${i/f16/q4_0}" q4_0
         fi
     done
 elif [[ "$arg1" == '--server' || "$arg1" == '-s' ]]; then
-    ./server "$@"
+    exec ./llama-server "$@"
 else
     echo "Unknown command: $arg1"
     echo "Available commands: "
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
+    echo "  --bench (-b): Benchmark the performance of the inference for various parameters."
+    echo "              ex: -m model.gguf"
+    echo "  --perplexity (-p): Measure the perplexity of a model over a given text."
+    echo "              ex: -m model.gguf -f file.txt"
     echo "  --convert (-c): Convert a llama model into ggml"
     echo "              ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "              ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
-    echo "  --finetune (-f): Run finetune command to create a lora finetune of the model"
-    echo "              See documentation for finetune for command-line parameters"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
     echo "              ex: \"/models/\" 7B"
     echo "  --server (-s): Run a model on the server"
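This script is the entrypoint of the `full` images above, so the flags it parses map directly onto `docker run` arguments. A usage sketch; the image tag is a placeholder for whatever the full image was built or pulled as:

    # Run the CLI through the tools.sh entrypoint of a "full" image (tag is a placeholder)
    docker run -v /path/to/models:/models llama-cpp-full \
      --run -m /models/7B/ggml-model-q4_0.gguf \
      -p "Building a website can be done in 10 simple steps:" -n 512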
.devops/vulkan.Dockerfile (new file, 89 lines)
@@ -0,0 +1,89 @@
ARG UBUNTU_VERSION=24.04

FROM ubuntu:$UBUNTU_VERSION AS build

# Install build tools
RUN apt update && apt install -y git build-essential cmake wget

# Install Vulkan SDK and cURL
RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl

# Build it
WORKDIR /app

COPY . .

RUN cmake -B build -DGGML_NATIVE=OFF -DGGML_VULKAN=1 -DLLAMA_CURL=1 && \
    cmake --build build --config Release -j$(nproc)

RUN mkdir -p /app/lib && \
    find build -name "*.so" -exec cp {} /app/lib \;

RUN mkdir -p /app/full \
    && cp build/bin/* /app/full \
    && cp *.py /app/full \
    && cp -r gguf-py /app/full \
    && cp -r requirements /app/full \
    && cp requirements.txt /app/full \
    && cp .devops/tools.sh /app/full/tools.sh

## Base image
FROM ubuntu:$UBUNTU_VERSION AS base

RUN apt-get update \
    && apt-get install -y libgomp1 curl libvulkan-dev \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

COPY --from=build /app/lib/ /app

### Full
FROM base AS full

COPY --from=build /app/full /app

WORKDIR /app

RUN apt-get update \
    && apt-get install -y \
    git \
    python3 \
    python3-pip \
    python3-wheel \
    && pip install --break-system-packages --upgrade setuptools \
    && pip install --break-system-packages -r requirements.txt \
    && apt autoremove -y \
    && apt clean -y \
    && rm -rf /tmp/* /var/tmp/* \
    && find /var/cache/apt/archives /var/lib/apt/lists -not -name lock -type f -delete \
    && find /var/cache -type f -delete

ENTRYPOINT ["/app/tools.sh"]

### Light, CLI only
FROM base AS light

COPY --from=build /app/full/llama-cli /app

WORKDIR /app

ENTRYPOINT [ "/app/llama-cli" ]

### Server, Server only
FROM base AS server

ENV LLAMA_ARG_HOST=0.0.0.0

COPY --from=build /app/full/llama-server /app

WORKDIR /app

HEALTHCHECK CMD [ "curl", "-f", "http://localhost:8080/health" ]

ENTRYPOINT [ "/app/llama-server" ]
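The Vulkan image follows the same base/full/light/server layout; at run time the container additionally needs access to the host GPU device nodes. A sketch, with the tag and the device path as assumptions that depend on the host setup:

    # Build the CLI-only Vulkan image and run it with the host's DRM devices passed through
    docker build -f .devops/vulkan.Dockerfile --target light -t llama-cpp-vulkan-light .
    docker run --device /dev/dri -v /path/to/models:/models llama-cpp-vulkan-light \
      -m /models/model.gguf -p "Hello" -ngl 99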
@@ -1,7 +1,7 @@
 *.o
 *.a
 .cache/
-.git/
+# Do not ignore .git directory, otherwise the reported build number will always be 0
 .github/
 .gitignore
 .vs/
@@ -12,8 +12,8 @@ build*/
 
 models/*
 
-/main
-/quantize
+/llama-cli
+/llama-quantize
 
 arm_neon.h
 compile_commands.json
.ecrc
@@ -1,5 +1,5 @@
 {
-    "Exclude": ["^\\.gitmodules$"],
+    "Exclude": ["^\\.gitmodules$", "stb_image\\.h"],
     "Disable": {
         "IndentSize": true
     }
@@ -24,5 +24,27 @@ insert_final_newline = unset
 [examples/server/public/*]
 indent_size = 2
 
+[examples/server/public/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
+[examples/server/deps_*]
+trim_trailing_whitespace = unset
+indent_style = unset
+indent_size = unset
+
 [examples/llama.swiftui/llama.swiftui.xcodeproj/*]
 indent_style = tab
+
+[examples/cvector-generator/*.txt]
+trim_trailing_whitespace = unset
+insert_final_newline = unset
+
+[models/templates/*.jinja]
+indent_style = unset
+indent_size = unset
+end_of_line = unset
+charset = unset
+trim_trailing_whitespace = unset
+insert_final_newline = unset
.flake8
@@ -1,2 +1,17 @@
 [flake8]
 max-line-length = 125
+ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
+exclude =
+    # Do not traverse examples
+    examples,
+    # Do not include package initializers
+    __init__.py,
+    # No need to traverse our git directory
+    .git,
+    # There's no value in checking cache directories
+    __pycache__,
+    # No need to include the build path
+    build,
+    # This contains builds that we don't want to check
+    dist  # This is generated with `python build .` for package releases
+# max-complexity = 10
.github/ISSUE_TEMPLATE/010-bug-compilation.yml (new file, 87 lines)
@@ -0,0 +1,87 @@
name: Bug (compilation)
description: Something goes wrong when trying to compile llama.cpp.
title: "Compile bug: "
labels: ["bug-unconfirmed", "compilation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the compilation of llama.cpp fails.
        Before opening an issue, please confirm that the compilation still fails with `-DGGML_CCACHE=OFF`.
        If the compilation succeeds with ccache disabled you should be able to permanently fix the issue
        by clearing `~/.cache/ccache` (on Linux).
  - type: textarea
    id: commit
    attributes:
      label: Git commit
      description: Which commit are you trying to compile?
      placeholder: |
        $git rev-parse HEAD
        84a07a17b1b08cf2b9747c633a2372782848a27f
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific compile flags, that information would be very much appreciated by us.
      placeholder: >
        I'm trying to compile llama.cpp with CUDA support on a fresh install of Ubuntu and get error XY.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Compile command
      description: >
        Please provide the exact command you used to compile llama.cpp. For example: `cmake -B ...`.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/011-bug-results.yml (new file, 101 lines)
@@ -0,0 +1,101 @@
name: Bug (model use)
description: Something goes wrong when using a model (in general, not specific to a single llama.cpp module).
title: "Eval bug: "
labels: ["bug-unconfirmed", "model evaluation"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for bug reports where the model evaluation results
        (i.e. the generated text) are incorrect or llama.cpp crashes during model evaluation.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
        The `llama-cli` binary can be used for simple and reproducible model inference.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software are you running? (use `--version` to get a version string)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: true
  - type: dropdown
    id: backends
    attributes:
      label: GGML backends
      description: Which GGML backends do you know to be affected?
      options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan]
      multiple: true
    validations:
      required: true
  - type: textarea
    id: hardware
    attributes:
      label: Hardware
      description: Which CPUs/GPUs are you using?
      placeholder: >
        e.g. Ryzen 5950X + 2x RTX 4090
    validations:
      required: true
  - type: textarea
    id: model
    attributes:
      label: Models
      description: >
        Which model(s) at which quantization were you using when encountering the bug?
        If you downloaded a GGUF file off of Huggingface, please provide a link.
      placeholder: >
        e.g. Meta LLaMA 3.1 Instruct 8b q4_K_M
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it.
        If you can narrow down the bug to specific hardware, compile flags, or command line arguments,
        that information would be very much appreciated by us.
      placeholder: >
        e.g. when I run llama-cli with -ngl 99 I get garbled outputs.
        When I use -ngl 0 it works correctly.
        Here are the exact commands that I used: ...
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        Please copy and paste any relevant log output, including the command that you entered and any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: true
.github/ISSUE_TEMPLATE/019-bug-misc.yml (new file, 91 lines)
@@ -0,0 +1,91 @@
name: Bug (misc.)
description: Something is not working the way it should (and it's not covered by any of the above cases).
title: "Misc. bug: "
labels: ["bug-unconfirmed"]
body:
  - type: markdown
    attributes:
      value: >
        Thanks for taking the time to fill out this bug report!
        This issue template is intended for miscellaneous bugs that don't fit into any other category.
        If you encountered the issue while using an external UI (e.g. ollama),
        please reproduce your issue using one of the examples/binaries in this repository.
  - type: textarea
    id: version
    attributes:
      label: Name and Version
      description: Which version of our software is affected? (You can use `--version` to get a version string.)
      placeholder: |
        $./llama-cli --version
        version: 2999 (42b4109e)
        built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu
    validations:
      required: true
  - type: dropdown
    id: operating-system
    attributes:
      label: Operating systems
      description: Which operating systems do you know to be affected?
      multiple: true
      options:
        - Linux
        - Mac
        - Windows
        - BSD
        - Other? (Please let us know in description)
    validations:
      required: false
  - type: dropdown
    id: module
    attributes:
      label: Which llama.cpp modules do you know to be affected?
      multiple: true
      options:
        - Documentation/Github
        - libllama (core library)
        - llama-cli
        - llama-server
        - llama-bench
        - llama-quantize
        - Python/Bash scripts
        - Test code
        - Other (Please specify in the next section)
    validations:
      required: false
  - type: textarea
    id: command
    attributes:
      label: Command line
      description: >
        Please provide the exact commands you entered, if applicable. For example: `llama-server -m ... -c ...`, `llama-cli -m ...`, etc.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
  - type: textarea
    id: info
    attributes:
      label: Problem description & steps to reproduce
      description: >
        Please give us a summary of the problem and tell us how to reproduce it (if applicable).
    validations:
      required: true
  - type: textarea
    id: first_bad_commit
    attributes:
      label: First Bad Commit
      description: >
        If the bug was not present on an earlier version and it's not trivial to track down: when did it start appearing?
        If possible, please do a git bisect and identify the exact commit that introduced the bug.
    validations:
      required: false
  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: >
        If applicable, please copy and paste any relevant log output, including any generated text.
        This will be automatically formatted into code, so no need for backticks.
      render: shell
    validations:
      required: false
.github/ISSUE_TEMPLATE/020-enhancement.yml (new file, 51 lines)
@@ -0,0 +1,51 @@
name: Enhancement
description: Used to request enhancements for llama.cpp.
title: "Feature Request: "
labels: ["enhancement"]
body:
  - type: markdown
    attributes:
      value: |
        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)

  - type: checkboxes
    id: prerequisites
    attributes:
      label: Prerequisites
      description: Please confirm the following before submitting your enhancement request.
      options:
        - label: I am running the latest code. Mention the version if possible as well.
          required: true
        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
          required: true
        - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
          required: true
        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
          required: true

  - type: textarea
    id: feature-description
    attributes:
      label: Feature Description
      description: Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.
      placeholder: Detailed description of the enhancement
    validations:
      required: true

  - type: textarea
    id: motivation
    attributes:
      label: Motivation
      description: Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.
      placeholder: Explanation of why this feature is needed and its benefits
    validations:
      required: true

  - type: textarea
    id: possible-implementation
    attributes:
      label: Possible Implementation
      description: If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
      placeholder: Detailed description of potential implementation
    validations:
      required: false
.github/ISSUE_TEMPLATE/030-research.yml (new file, 52 lines)
@@ -0,0 +1,52 @@
name: Research
description: Track new technical research area.
title: "Research: "
labels: ["research 🔬"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)

  - type: checkboxes
    id: research-stage
    attributes:
      label: Research Stage
      description: Track general state of this research ticket
      options:
        - label: Background Research (Let's try to avoid reinventing the wheel)
        - label: Hypothesis Formed (How do you think this will work and it's effect?)
        - label: Strategy / Implementation Forming
        - label: Analysis of results
        - label: Debrief / Documentation (So people in the future can learn from us)

  - type: textarea
    id: background
    attributes:
      label: Previous existing literature and research
      description: Whats the current state of the art and whats the motivation for this research?

  - type: textarea
    id: hypothesis
    attributes:
      label: Hypothesis
      description: How do you think this will work and it's effect?

  - type: textarea
    id: implementation
    attributes:
      label: Implementation
      description: Got an approach? e.g. a PR ready to go?

  - type: textarea
    id: analysis
    attributes:
      label: Analysis
      description: How does the proposed implementation behave?

  - type: textarea
    id: logs
    attributes:
      label: Relevant log output
      description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
      render: shell
.github/ISSUE_TEMPLATE/040-refactor.yml (new file, 28 lines)
@@ -0,0 +1,28 @@
name: Refactor (Maintainers)
description: Used to track refactoring opportunities.
title: "Refactor: "
labels: ["refactor"]
body:
  - type: markdown
    attributes:
      value: |
        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.

  - type: textarea
    id: background-description
    attributes:
      label: Background Description
      description: Please provide a detailed written description of the pain points you are trying to solve.
      placeholder: Detailed description behind your motivation to request refactor
    validations:
      required: true

  - type: textarea
    id: possible-approaches
    attributes:
      label: Possible Refactor Approaches
      description: If you have some idea of possible approaches to solve this problem. You may want to make it a todo list.
      placeholder: Your idea of possible refactoring opportunity/approaches
    validations:
      required: false
.github/ISSUE_TEMPLATE/bug.md (deleted, 9 lines)
@@ -1,9 +0,0 @@
---
name: Bug template
about: Used to report bugs in llama.cpp
labels: ["bug-unconfirmed"]
assignees: ''

---

Please include information about your system, the steps to reproduce the bug, and the version of llama.cpp that you are using. If possible, please provide a minimal code example that reproduces the bug.
.github/ISSUE_TEMPLATE/config.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
blank_issues_enabled: true
contact_links:
  - name: Got an idea?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
    about: Pop it there. It may then become an enhancement ticket.
  - name: Got a question?
    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
    about: Ask a question there!
  - name: Want to contribute?
    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
    about: Head to the contribution guide page of the wiki for areas you can help with
.github/ISSUE_TEMPLATE/enhancement.md (deleted, 28 lines)
@@ -1,28 +0,0 @@
---
name: Enhancement template
about: Used to request enhancements for llama.cpp
labels: ["enhancement"]
assignees: ''

---

# Prerequisites

Please answer the following questions for yourself before submitting an issue.

- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now.
- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed).
- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share.

# Feature Description

Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement.

# Motivation

Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users.

# Possible Implementation

If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better.
.github/labeler.yml (new file, 86 lines)
@@ -0,0 +1,86 @@
# https://github.com/actions/labeler
Kompute:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-kompute.h
          - ggml/src/ggml-kompute/**
          - README-kompute.md
Apple Metal:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-metal.h
          - ggml/src/ggml-metal/**
          - README-metal.md
SYCL:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-sycl.h
          - ggml/src/ggml-sycl/**
          - docs/backend/SYCL.md
          - examples/sycl/**
Nvidia GPU:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-cuda.h
          - ggml/src/ggml-cuda/**
Vulkan:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/include/ggml-vulkan.h
          - ggml/src/ggml-vulkan/**
documentation:
  - changed-files:
      - any-glob-to-any-file:
          - docs/**
          - media/**
testing:
  - changed-files:
      - any-glob-to-any-file:
          - tests/**
build:
  - changed-files:
      - any-glob-to-any-file:
          - cmake/**
          - CMakeLists.txt
          - CMakePresets.json
examples:
  - changed-files:
      - any-glob-to-any-file: examples/**
devops:
  - changed-files:
      - any-glob-to-any-file:
          - .devops/**
          - .github/**
          - ci/**
python:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.py"
          - requirements/**
          - gguf-py/**
          - .flake8
script:
  - changed-files:
      - any-glob-to-any-file:
          - scripts/**
android:
  - changed-files:
      - any-glob-to-any-file:
          - examples/llama.android/**
server:
  - changed-files:
      - any-glob-to-any-file:
          - examples/server/**
ggml:
  - changed-files:
      - any-glob-to-any-file:
          - ggml/**
nix:
  - changed-files:
      - any-glob-to-any-file:
          - "**/*.nix"
          - .github/workflows/nix-*.yml
          - .devops/nix/nixpkgs-instances.nix
embedding:
  - changed-files:
      - any-glob-to-any-file: examples/embedding/
.github/pull_request_template.md (new file, 1 line)
@@ -0,0 +1 @@
*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
.github/workflows/bench.yml.disabled (new file, 315 lines)
@@ -0,0 +1,315 @@
# TODO: there have been some issues with the workflow, so disabling for now
# https://github.com/ggerganov/llama.cpp/issues/7893
#
# Benchmark
name: Benchmark

on:
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC80adis_H100_v5
      sha:
        description: 'Commit SHA1 to build'
        required: false
        type: string
      duration:
        description: 'Duration of the bench'
        type: string
        default: 10m

  push:
    branches:
      - master
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  pull_request_target:
    types: [opened, synchronize, reopened]
    paths: ['llama.cpp', 'ggml.c', 'ggml-backend.cpp', 'ggml-quants.c', '**/*.cu', 'examples/server/*.h*', 'examples/server/*.cpp']
  schedule:
    - cron: '04 2 * * *'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}-${{ github.event.inputs.sha }}
  cancel-in-progress: true

jobs:
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    env:
      RUNNER_LABEL: Standard_NC4as_T4_v3 # FIXME Do not find a way to not duplicate it
      N_USERS: 8
      DURATION: 10m

    strategy:
      matrix:
        model: [phi-2]
        ftype: [q4_0, q8_0, f16]
        include:
          - model: phi-2
            ftype: q4_0
            pr_comment_enabled: "true"

    if: |
      inputs.gpu-series == 'Standard_NC4as_T4_v3'
      || (
        github.event_name == 'schedule'
        && github.ref_name == 'master'
        && github.repository_owner == 'ggerganov'
      )
      || github.event_name == 'pull_request_target'
      || (
        github.event_name == 'push'
        && github.event.ref == 'refs/heads/master'
        && github.repository_owner == 'ggerganov'
      )
    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}

      - name: Install python env
        id: pipenv
        run: |
          cd examples/server/bench
          python3 -m venv venv
          source venv/bin/activate
          pip install -r requirements.txt

      - name: Prometheus
        id: install_prometheus
        run: |
          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
          tar xzf prometheus*.tar.gz --strip-components=1
          ./prometheus --config.file=examples/server/bench/prometheus.yml &
          while ! nc -z localhost 9090; do
            sleep 0.1
          done

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: '1.21'

      - name: Install k6 and xk6-sse
        id: k6_installation
        run: |
          cd examples/server/bench
          go install go.k6.io/xk6/cmd/xk6@latest
          xk6 build master \
            --with github.com/phymbert/xk6-sse

      - name: Build
        id: cmake_build
        run: |
          set -eux
          cmake -B build \
              -DGGML_NATIVE=OFF \
              -DLLAMA_BUILD_SERVER=ON \
              -DLLAMA_CURL=ON \
              -DLLAMA_CUBLAS=ON \
              -DCUDAToolkit_ROOT=/usr/local/cuda \
              -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
              -DCMAKE_CUDA_ARCHITECTURES=75 \
              -DLLAMA_FATAL_WARNINGS=OFF \
              -DLLAMA_ALL_WARNINGS=OFF \
              -DCMAKE_BUILD_TYPE=Release;
          cmake --build build --config Release -j $(nproc) --target llama-server

      - name: Download the dataset
        id: download_dataset
        run: |
          cd examples/server/bench
          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

      - name: Server bench
        id: server_bench
        env:
          HEAD_REF: ${{ github.head_ref || github.ref_name }}
        run: |
          set -eux

          cd examples/server/bench
          source venv/bin/activate
          python bench.py \
              --runner-label ${{ env.RUNNER_LABEL }} \
              --name ${{ github.job }} \
              --branch $HEAD_REF \
              --commit ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha }} \
              --scenario script.js \
              --duration ${{ github.event.inputs.duration || env.DURATION }} \
              --hf-repo ggml-org/models \
              --hf-file ${{ matrix.model }}/ggml-model-${{ matrix.ftype }}.gguf \
              --model-path-prefix /models \
              --parallel ${{ env.N_USERS }} \
              -ngl 33 \
              --batch-size 2048 \
              --ubatch-size 256 \
              --ctx-size 16384 \
              --n-prompts 1000 \
              --max-prompt-tokens 1024 \
              --max-tokens 2048

          cat results.github.env >> $GITHUB_ENV

          # Remove dataset as we do not want it in the artefact
          rm ShareGPT_V3_unfiltered_cleaned_split.json

      - uses: actions/upload-artifact@v4
        with:
          name: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          compression-level: 9
          path: |
            examples/server/bench/*.jpg
            examples/server/bench/*.json
            examples/server/bench/*.log

      - name: Commit status
        uses: Sibz/github-status-action@v1
        with:
          authToken: ${{secrets.GITHUB_TOKEN}}
          sha: ${{ inputs.sha || github.event.pull_request.head.sha || github.sha }}
          context: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          description: |
            ${{ env.BENCH_RESULTS }}
          state: 'success'

      - name: Upload benchmark images
        uses: devicons/public-upload-to-imgur@v2.2.2
        continue-on-error: true # Important as it looks unstable: 503
        id: imgur_step
        with:
          client_id: ${{secrets.IMGUR_CLIENT_ID}}
          path: |
            examples/server/bench/prompt_tokens_seconds.jpg
            examples/server/bench/predicted_tokens_seconds.jpg
            examples/server/bench/kv_cache_usage_ratio.jpg
            examples/server/bench/requests_processing.jpg

      - name: Extract mermaid
        id: set_mermaid
        run: |
          set -eux

          cd examples/server/bench
          PROMPT_TOKENS_SECONDS=$(cat prompt_tokens_seconds.mermaid)
          echo "PROMPT_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PROMPT_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          PREDICTED_TOKENS_SECONDS=$(cat predicted_tokens_seconds.mermaid)
          echo "PREDICTED_TOKENS_SECONDS<<EOF" >> $GITHUB_ENV
          echo "$PREDICTED_TOKENS_SECONDS" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          KV_CACHE_USAGE_RATIO=$(cat kv_cache_usage_ratio.mermaid)
          echo "KV_CACHE_USAGE_RATIO<<EOF" >> $GITHUB_ENV
          echo "$KV_CACHE_USAGE_RATIO" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

          REQUESTS_PROCESSING=$(cat requests_processing.mermaid)
          echo "REQUESTS_PROCESSING<<EOF" >> $GITHUB_ENV
          echo "$REQUESTS_PROCESSING" >> $GITHUB_ENV
          echo "EOF" >> $GITHUB_ENV

      - name: Extract image url
        id: extract_image_url
        continue-on-error: true
        run: |
          set -eux

          echo "IMAGE_O=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[0] }}" >> $GITHUB_ENV
          echo "IMAGE_1=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[1] }}" >> $GITHUB_ENV
          echo "IMAGE_2=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[2] }}" >> $GITHUB_ENV
          echo "IMAGE_3=${{ fromJSON(steps.imgur_step.outputs.imgur_urls)[3] }}" >> $GITHUB_ENV

      - name: Comment PR
        uses: mshick/add-pr-comment@v2
        id: comment_pr
        if: ${{ github.event.pull_request != '' && matrix.pr_comment_enabled == 'true' }}
        with:
          message-id: bench-server-${{ github.job }}-${{ env.RUNNER_LABEL }}-${{ matrix.model }}-${{ matrix.ftype }}
          message: |
            <p align="center">

            📈 **llama.cpp server** for _${{ github.job }}_ on _${{ env.RUNNER_LABEL }}_ for `${{ matrix.model }}`-`${{ matrix.ftype }}`: **${{ env.BENCH_ITERATIONS}} iterations** 🚀

            </p>

            <details>

            <summary>Expand details for performance related PR only</summary>

            - Concurrent users: ${{ env.N_USERS }}, duration: ${{ github.event.inputs.duration || env.DURATION }}
            - HTTP request : avg=${{ env.HTTP_REQ_DURATION_AVG }}ms p(95)=${{ env.HTTP_REQ_DURATION_P_95_ }}ms fails=${{ env.HTTP_REQ_FAILED_PASSES }}, finish reason: stop=${{ env.LLAMACPP_COMPLETIONS_STOP_RATE_PASSES }} truncated=${{ env.LLAMACPP_COMPLETIONS_TRUNCATED_RATE_PASSES }}
            - Prompt processing (pp): avg=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_PROMPT_PROCESSING_SECOND_P_95_ }}tk/s
            - Token generation (tg): avg=${{ env.LLAMACPP_TOKENS_SECOND_AVG }}tk/s p(95)=${{ env.LLAMACPP_TOKENS_SECOND_P_95_ }}tk/s
            - ${{ env.BENCH_GRAPH_XLABEL }}


            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_O }}" alt="prompt_tokens_seconds" />

            <details>

            <summary>More</summary>

            ```mermaid
            ${{ env.PROMPT_TOKENS_SECONDS }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_1 }}" alt="predicted_tokens_seconds"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.PREDICTED_TOKENS_SECONDS }}
            ```

            </details>

            </p>

            <details>

            <summary>Details</summary>

            <p align="center">

            <img width="100%" height="100%" src="${{ env.IMAGE_2 }}" alt="kv_cache_usage_ratio" />

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.KV_CACHE_USAGE_RATIO }}
            ```

            </details>

            <img width="100%" height="100%" src="${{ env.IMAGE_3 }}" alt="requests_processing"/>

            <details>
            <summary>More</summary>

            ```mermaid
            ${{ env.REQUESTS_PROCESSING }}
            ```

            </details>

            </p>
            </details>
            </details>
1248 .github/workflows/build.yml (vendored)
File diff suppressed because it is too large
28 .github/workflows/close-issue.yml (vendored, new file)
@@ -0,0 +1,28 @@
name: Close inactive issues
on:
schedule:
- cron: "42 0 * * *"

# Fine-grant permission
# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
permissions:
issues: write

jobs:
close-issues:
runs-on: ubuntu-latest
permissions:
issues: write
pull-requests: write
steps:
- uses: actions/stale@v5
with:
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug,roadmap"
days-before-issue-stale: 30
days-before-issue-close: 14
stale-issue-label: "stale"
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
operations-per-run: 10000
repo-token: ${{ secrets.GITHUB_TOKEN }}
36 .github/workflows/code-coverage.yml (vendored)
@@ -1,36 +0,0 @@
name: Code Coverage
on: [push, pull_request]

env:
GGML_NLOOP: 3
GGML_N_THREADS: 1

jobs:
run:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v3

- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential gcc-8 lcov

- name: Build
run: CC=gcc-8 make -j LLAMA_CODE_COVERAGE=1 tests

- name: Run tests
run: CC=gcc-8 make test

- name: Generate coverage report
run: |
make coverage
make lcov-report

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
files: lcov-report/coverage.info
158 .github/workflows/docker.yml (vendored)
@@ -10,45 +10,50 @@
 name: Publish Docker image

 on:
-pull_request:
+workflow_dispatch: # allows manual triggering
-push:
+schedule:
-branches:
+# Rebuild daily rather than on every push because it is expensive
-- master
+- cron: '12 4 * * *'

+concurrency:
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+cancel-in-progress: true
+
+# Fine-grant permission
+# https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#modifying-the-permissions-for-the-github_token
+permissions:
+packages: write

 jobs:
 push_to_registry:
 name: Push Docker image to Docker Hub
-if: github.event.pull_request.draft == false

-runs-on: ubuntu-latest
+runs-on: ubuntu-22.04
 env:
 COMMIT_SHA: ${{ github.sha }}
 strategy:
+fail-fast: false
 matrix:
 config:
-- { tag: "light", dockerfile: ".devops/main.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+# Multi-stage build
-- { tag: "full", dockerfile: ".devops/full.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
-- { tag: "server", dockerfile: ".devops/server.Dockerfile", platforms: "linux/amd64,linux/arm64" }
+- { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-# NOTE(canardletter): The CUDA builds on arm64 are very slow, so I
+- { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-# have disabled them for now until the reason why
+- { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-# is understood.
+- { tag: "vulkan", dockerfile: ".devops/vulkan.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
-- { tag: "light-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platforms: "linux/amd64" }
+# Note: the rocm images are failing due to a compiler error and are disabled until this is fixed to allow the workflow to complete
-- { tag: "full-cuda", dockerfile: ".devops/full-cuda.Dockerfile", platforms: "linux/amd64" }
+#- {tag: "rocm", dockerfile: ".devops/rocm.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: true }
-- { tag: "server-cuda", dockerfile: ".devops/server-cuda.Dockerfile", platforms: "linux/amd64" }
-- { tag: "light-rocm", dockerfile: ".devops/main-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "full-rocm", dockerfile: ".devops/full-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "server-rocm", dockerfile: ".devops/server-rocm.Dockerfile", platforms: "linux/amd64,linux/arm64" }
-- { tag: "light-intel", dockerfile: ".devops/main-intel.Dockerfile", platforms: "linux/amd64" }
-- { tag: "server-intel", dockerfile: ".devops/server-intel.Dockerfile", platforms: "linux/amd64" }
 steps:
 - name: Check out the repo
-uses: actions/checkout@v3
+uses: actions/checkout@v4
+with:
+fetch-depth: 0 # preserve git history, so we can determine the build number

 - name: Set up QEMU
-uses: docker/setup-qemu-action@v2
+uses: docker/setup-qemu-action@v3

 - name: Set up Docker Buildx
-uses: docker/setup-buildx-action@v2
+uses: docker/setup-buildx-action@v3

 - name: Log in to Docker Hub
 uses: docker/login-action@v2
@@ -57,9 +62,45 @@ jobs:
 username: ${{ github.repository_owner }}
 password: ${{ secrets.GITHUB_TOKEN }}

-# https://github.com/jlumbroso/free-disk-space/tree/54081f138730dfa15788a46383842cd2f914a1be#example
+- name: Determine tag name
+id: tag
+shell: bash
+run: |
+BUILD_NUMBER="$(git rev-list --count HEAD)"
+SHORT_HASH="$(git rev-parse --short=7 HEAD)"
+REPO_OWNER="${GITHUB_REPOSITORY_OWNER@L}" # to lower case
+REPO_NAME="${{ github.event.repository.name }}"
+
+# determine tag name postfix (build number, commit hash)
+if [[ "${{ env.GITHUB_BRANCH_NAME }}" == "master" ]]; then
+TAG_POSTFIX="-b${BUILD_NUMBER}"
+else
+SAFE_NAME=$(echo "${{ env.GITHUB_BRANCH_NAME }}" | tr '/' '-')
+TAG_POSTFIX="-${SAFE_NAME}-${SHORT_HASH}"
+fi
+# list all tags possible
+if [[ "${{ matrix.config.tag }}" == "cpu" ]]; then
+TYPE=""
+else
+TYPE="-${{ matrix.config.tag }}"
+fi
+PREFIX="ghcr.io/${REPO_OWNER}/${REPO_NAME}:"
+FULLTAGS="${PREFIX}full${TYPE},${PREFIX}full${TYPE}${TAG_POSTFIX}"
+LIGHTTAGS="${PREFIX}light${TYPE},${PREFIX}light${TYPE}${TAG_POSTFIX}"
+SERVERTAGS="${PREFIX}server${TYPE},${PREFIX}server${TYPE}${TAG_POSTFIX}"
+echo "full_output_tags=$FULLTAGS" >> $GITHUB_OUTPUT
+echo "light_output_tags=$LIGHTTAGS" >> $GITHUB_OUTPUT
+echo "server_output_tags=$SERVERTAGS" >> $GITHUB_OUTPUT
+echo "full_output_tags=$FULLTAGS" # print out for debugging
+echo "light_output_tags=$LIGHTTAGS" # print out for debugging
+echo "server_output_tags=$SERVERTAGS" # print out for debugging
+env:
+GITHUB_BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+GITHUB_REPOSITORY_OWNER: '${{ github.repository_owner }}'

 - name: Free Disk Space (Ubuntu)
-uses: jlumbroso/free-disk-space@main
+if: ${{ matrix.config.free_disk_space == true }}
+uses: ggml-org/free-disk-space@v1.3.1
 with:
 # this might remove tools that are actually needed,
 # if set to "true" but frees about 6 GB
@@ -74,34 +115,59 @@ jobs:
 docker-images: true
 swap-storage: true

-- name: Determine tag name
+- name: Build and push Full Docker image (tagged + versioned)
-id: tag
+if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.full == true }}
-shell: bash
+uses: docker/build-push-action@v6
-run: |
-BUILD_NUMBER="$(git rev-list --count HEAD)"
-SHORT_HASH="$(git rev-parse --short=7 HEAD)"
-if [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
-echo "name=b${BUILD_NUMBER}" >> $GITHUB_OUTPUT
-else
-SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
-echo "name=${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}" >> $GITHUB_OUTPUT
-fi

-- name: Build and push Docker image (versioned)
-if: github.event_name == 'push'
-uses: docker/build-push-action@v4
 with:
 context: .
 push: true
 platforms: ${{ matrix.config.platforms }}
-tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
+# tag list is generated from step above
+tags: ${{ steps.tag.outputs.full_output_tags }}
 file: ${{ matrix.config.dockerfile }}
+target: full
+provenance: false
+# using github experimental cache
+cache-from: type=gha
+cache-to: type=gha,mode=max
+# return to this if the experimental github cache is having issues
+#cache-to: type=local,dest=/tmp/.buildx-cache
+#cache-from: type=local,src=/tmp/.buildx-cache

-- name: Build and push Docker image (tagged)
+- name: Build and push Light Docker image (tagged + versioned)
-uses: docker/build-push-action@v4
+if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.light == true }}
+uses: docker/build-push-action@v6
 with:
 context: .
-push: ${{ github.event_name == 'push' }}
+push: true
 platforms: ${{ matrix.config.platforms }}
-tags: "ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }},ghcr.io/${{ github.repository_owner }}/llama.cpp:${{ matrix.config.tag }}-${{ steps.tag.outputs.name }}"
+# tag list is generated from step above
+tags: ${{ steps.tag.outputs.light_output_tags }}
 file: ${{ matrix.config.dockerfile }}
+target: light
+provenance: false
+# using github experimental cache
+cache-from: type=gha
+cache-to: type=gha,mode=max
+# return to this if the experimental github cache is having issues
+#cache-to: type=local,dest=/tmp/.buildx-cache
+#cache-from: type=local,src=/tmp/.buildx-cache

+- name: Build and push Server Docker image (tagged + versioned)
+if: ${{ (github.event_name == 'push' || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch') && matrix.config.server == true }}
+uses: docker/build-push-action@v6
+with:
+context: .
+push: true
+platforms: ${{ matrix.config.platforms }}
+# tag list is generated from step above
+tags: ${{ steps.tag.outputs.server_output_tags }}
+file: ${{ matrix.config.dockerfile }}
+target: server
+provenance: false
+# using github experimental cache
+cache-from: type=gha
+cache-to: type=gha,mode=max
+# return to this if the experimental github cache is having issues
+#cache-to: type=local,dest=/tmp/.buildx-cache
+#cache-from: type=local,src=/tmp/.buildx-cache
10 .github/workflows/editorconfig.yml (vendored)
@@ -14,10 +14,16 @@ on:
 branches:
 - master

+concurrency:
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+cancel-in-progress: true

 jobs:
 editorconfig:
 runs-on: ubuntu-latest
 steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
-- uses: editorconfig-checker/action-editorconfig-checker@main
+- uses: editorconfig-checker/action-editorconfig-checker@v2
+with:
+version: v3.0.3
 - run: editorconfig-checker
4 .github/workflows/gguf-publish.yml (vendored)
@@ -24,9 +24,9 @@ jobs:
 runs-on: ubuntu-latest

 steps:
-- uses: actions/checkout@v3
+- uses: actions/checkout@v4
 - name: Set up Python
-uses: actions/setup-python@v2
+uses: actions/setup-python@v5
 with:
 python-version: '3.9.x'
 - name: Install dependencies
17 .github/workflows/labeler.yml (vendored, new file)
@@ -0,0 +1,17 @@
name: "Pull Request Labeler"
on:
- pull_request_target

jobs:
labeler:
permissions:
contents: read
pull-requests: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
repository: "ggerganov/llama.cpp"
- uses: actions/labeler@v5
with:
configuration-path: '.github/labeler.yml'
62 .github/workflows/nix-ci-aarch64.yml (vendored)
@@ -1,62 +0,0 @@
name: Nix aarch64 builds

on:
workflow_dispatch: # allows manual triggering
schedule:
# Rebuild daily rather than on every push because QEMU is expensive (e.g.
# 1.5h instead of minutes with the cold cache).
#
# randint(0, 59), randint(0, 23)
- cron: '26 12 * * *'
# But also rebuild if we touched any of the Nix expressions:
push:
branches:
- master
paths: ['**/*.nix', 'flake.lock']
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/*.nix', 'flake.lock']

jobs:
nix-build-aarch64:
if: ${{ vars.CACHIX_NAME != '' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install QEMU
# Copy-paste from https://github.com/orgs/community/discussions/8305#discussioncomment-5888654
run: |
sudo apt-get update
sudo apt-get install -y qemu-user-static qemu-system-aarch64
sudo usermod -a -G kvm $USER
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-platforms = aarch64-linux
extra-system-features = nixos-test kvm
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.aarch64-linux"
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--systems aarch64-linux
--flake
".#checks.aarch64-linux"
69 .github/workflows/nix-ci.yml (vendored)
@@ -1,69 +0,0 @@
name: Nix CI

on:
workflow_dispatch: # allows manual triggering
push:
branches:
- master
pull_request:
types: [opened, synchronize, reopened]

jobs:
nix-eval:
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: List all flake outputs
run: nix flake show --all-systems
- name: Show all output paths
run: >
nix run github:nix-community/nix-eval-jobs
-- --gc-roots-dir gcroot
--flake
".#packages.$(nix eval --raw --impure --expr builtins.currentSystem)"
nix-build:
if: ${{ vars.CACHIX_NAME != '' }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, macos-latest ]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@v9
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
extra-conf: |
extra-substituters = https://${{ vars.CACHIX_NAME }}.cachix.org https://cuda-maintainers.cachix.org
extra-trusted-public-keys = ${{ vars.CACHIX_PUBLIC_KEY }} cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=
- uses: DeterminateSystems/magic-nix-cache-action@v2
with:
upstream-cache: https://${{ matrix.cachixName }}.cachix.org
- name: Set-up cachix to push the results to
uses: cachix/cachix-action@v13
with:
authToken: '${{ secrets.CACHIX_AUTH_TOKEN }}'
name: ${{ vars.CACHIX_NAME }}
- name: Build
run: >
nix run github:Mic92/nix-fast-build
-- --skip-cached --no-nom
--flake
".#checks.$(nix eval --raw --impure --expr builtins.currentSystem)"
22 .github/workflows/nix-flake-update.yml (vendored)
@@ -1,22 +0,0 @@
name: update-flake-lock
on:
workflow_dispatch:
schedule:
- cron: '0 0 * * 0' # runs weekly on Sunday at 00:00

jobs:
lockfile:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Install Nix
uses: DeterminateSystems/nix-installer-action@main
- name: Update flake.lock
uses: DeterminateSystems/update-flake-lock@main
with:
pr-title: "nix: update flake.lock"
pr-labels: |
nix
pr-reviewers: philiptaron,SomeoneSerge
token: ${{ secrets.FLAKE_TOKEN }}
36 .github/workflows/nix-publish-flake.yml (vendored)
@@ -1,36 +0,0 @@
# Make the flake discoverable on https://flakestry.dev and https://flakehub.com/flakes
name: "Publish a flake to flakestry & flakehub"
on:
push:
tags:
- "*"
workflow_dispatch:
inputs:
tag:
description: "The existing tag to publish"
type: "string"
required: true
jobs:
flakestry-publish:
runs-on: ubuntu-latest
permissions:
id-token: "write"
contents: "read"
steps:
- uses: flakestry/flakestry-publish@main
with:
version: "${{ inputs.tag || github.ref_name }}"
flakehub-publish:
runs-on: "ubuntu-latest"
permissions:
id-token: "write"
contents: "read"
steps:
- uses: "actions/checkout@v4"
with:
ref: "${{ (inputs.tag != null) && format('refs/tags/{0}', inputs.tag) || '' }}"
- uses: "DeterminateSystems/nix-installer-action@main"
- uses: "DeterminateSystems/flakehub-push@main"
with:
visibility: "public"
tag: "${{ inputs.tag }}"
18 .github/workflows/python-check-requirements.yml (vendored)
@@ -3,16 +3,20 @@ name: Python check requirements.txt
 on:
 push:
 paths:
+- '.github/workflows/python-check-requirements.yml'
 - 'scripts/check-requirements.sh'
 - 'convert*.py'
-- 'requirements.txt'
+- '**/requirements*.txt'
-- 'requirements/*.txt'
 pull_request:
 paths:
+- '.github/workflows/python-check-requirements.yml'
 - 'scripts/check-requirements.sh'
 - 'convert*.py'
-- 'requirements.txt'
+- '**/requirements*.txt'
-- 'requirements/*.txt'

+concurrency:
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+cancel-in-progress: true

 jobs:
 python-check-requirements:
@@ -20,10 +24,10 @@
 name: check-requirements
 steps:
 - name: Check out source repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up Python environment
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
 with:
 python-version: "3.11"
 - name: Run check-requirements.sh script
-run: bash scripts/check-requirements.sh nocleanup
+run: bash scripts/check-requirements.sh
20 .github/workflows/python-lint.yml (vendored)
@@ -1,6 +1,17 @@
 name: flake8 Lint

-on: [push, pull_request]
+on:
+push:
+branches:
+- master
+paths: ['.github/workflows/python-lint.yml', '**/*.py']
+pull_request:
+types: [opened, synchronize, reopened]
+paths: ['.github/workflows/python-lint.yml', '**/*.py']

+concurrency:
+group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
+cancel-in-progress: true

 jobs:
 flake8-lint:
@@ -8,13 +19,12 @@
 name: Lint
 steps:
 - name: Check out source repository
-uses: actions/checkout@v3
+uses: actions/checkout@v4
 - name: Set up Python environment
-uses: actions/setup-python@v4
+uses: actions/setup-python@v5
 with:
 python-version: "3.11"
 - name: flake8 Lint
 uses: py-actions/flake8@v2
 with:
-ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+plugins: "flake8-no-print"
-exclude: "examples/*,examples/*/**,*/**/__init__.py"
40 .github/workflows/python-type-check.yml (vendored, new file)
@@ -0,0 +1,40 @@
name: Python Type-Check

on:
push:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'
pull_request:
paths:
- '.github/workflows/python-type-check.yml'
- 'pyrightconfig.json'
- '**.py'
- '**/requirements*.txt'

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
cancel-in-progress: true

jobs:
python-type-check:
runs-on: ubuntu-latest
name: pyright type-check
steps:
- name: Check out source repository
uses: actions/checkout@v4
- name: Set up Python environment
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python dependencies
# TODO: use a venv
run: pip install -r requirements/requirements-all.txt
- name: Type-check with Pyright
uses: jakebailey/pyright-action@v2
with:
version: 1.1.382
level: warning
warnings: true
239 .github/workflows/server.yml (vendored, new file)
@@ -0,0 +1,239 @@
|
||||||
|
# Server build and tests
|
||||||
|
name: Server
|
||||||
|
|
||||||
|
on:
|
||||||
|
workflow_dispatch: # allows manual triggering
|
||||||
|
inputs:
|
||||||
|
sha:
|
||||||
|
description: 'Commit SHA1 to build'
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
slow_tests:
|
||||||
|
description: 'Run slow tests'
|
||||||
|
required: true
|
||||||
|
type: boolean
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- master
|
||||||
|
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened]
|
||||||
|
paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
|
||||||
|
|
||||||
|
env:
|
||||||
|
LLAMA_LOG_COLORS: 1
|
||||||
|
LLAMA_LOG_PREFIX: 1
|
||||||
|
LLAMA_LOG_TIMESTAMPS: 1
|
||||||
|
LLAMA_LOG_VERBOSITY: 10
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref || github.run_id }}
|
||||||
|
cancel-in-progress: true
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
server:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
sanitizer: [ADDRESS, UNDEFINED] # THREAD is broken
|
||||||
|
build_type: [RelWithDebInfo]
|
||||||
|
include:
|
||||||
|
- build_type: Release
|
||||||
|
sanitizer: ""
|
||||||
|
fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Dependencies
|
||||||
|
id: depends
|
||||||
|
run: |
|
||||||
|
sudo apt-get update
|
||||||
|
sudo apt-get -y install \
|
||||||
|
build-essential \
|
||||||
|
xxd \
|
||||||
|
git \
|
||||||
|
cmake \
|
||||||
|
curl \
|
||||||
|
wget \
|
||||||
|
language-pack-en \
|
||||||
|
libcurl4-openssl-dev
|
||||||
|
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
|
- name: Python setup
|
||||||
|
id: setup_python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Tests dependencies
|
||||||
|
id: test_dependencies
|
||||||
|
run: |
|
||||||
|
pip install -r examples/server/tests/requirements.txt
|
||||||
|
|
||||||
|
# Setup nodejs (to be used for verifying bundled index.html)
|
||||||
|
- uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: '22.11.0'
|
||||||
|
|
||||||
|
- name: WebUI - Install dependencies
|
||||||
|
id: webui_lint
|
||||||
|
run: |
|
||||||
|
cd examples/server/webui
|
||||||
|
npm ci
|
||||||
|
|
||||||
|
- name: WebUI - Check code format
|
||||||
|
id: webui_format
|
||||||
|
run: |
|
||||||
|
git config --global --add safe.directory $(realpath .)
|
||||||
|
cd examples/server/webui
|
||||||
|
git status
|
||||||
|
|
||||||
|
npm run format
|
||||||
|
git status
|
||||||
|
modified_files="$(git status -s)"
|
||||||
|
echo "Modified files: ${modified_files}"
|
||||||
|
if [ -n "${modified_files}" ]; then
|
||||||
|
echo "Files do not follow coding style. To fix: npm run format"
|
||||||
|
echo "${modified_files}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Verify bundled index.html
|
||||||
|
id: verify_server_index_html
|
||||||
|
run: |
|
||||||
|
git config --global --add safe.directory $(realpath .)
|
||||||
|
cd examples/server/webui
|
||||||
|
git status
|
||||||
|
|
||||||
|
npm run build
|
||||||
|
git status
|
||||||
|
modified_files="$(git status -s)"
|
||||||
|
echo "Modified files: ${modified_files}"
|
||||||
|
if [ -n "${modified_files}" ]; then
|
||||||
|
echo "Repository is dirty or server/webui is not built as expected"
|
||||||
|
echo "Hint: You may need to follow Web UI build guide in server/README.md"
|
||||||
|
echo "${modified_files}"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Build (no OpenMP)
|
||||||
|
id: cmake_build_no_openmp
|
||||||
|
if: ${{ matrix.sanitizer == 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
|
-DLLAMA_CURL=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||||
|
-DGGML_OPENMP=OFF ;
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
|
- name: Build (sanitizers)
|
||||||
|
id: cmake_build_sanitizers
|
||||||
|
if: ${{ matrix.sanitizer != '' && matrix.sanitizer != 'THREAD' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
|
-DLLAMA_CURL=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
|
||||||
|
-DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
|
- name: Build (sanitizers)
|
||||||
|
id: cmake_build
|
||||||
|
if: ${{ matrix.sanitizer == '' }}
|
||||||
|
run: |
|
||||||
|
cmake -B build \
|
||||||
|
-DGGML_NATIVE=OFF \
|
||||||
|
-DLLAMA_BUILD_SERVER=ON \
|
||||||
|
-DLLAMA_CURL=ON \
|
||||||
|
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ;
|
||||||
|
cmake --build build --config ${{ matrix.build_type }} -j $(nproc) --target llama-server
|
||||||
|
|
||||||
|
- name: Tests
|
||||||
|
id: server_integration_tests
|
||||||
|
if: ${{ matrix.sanitizer == '' }}
|
||||||
|
run: |
|
||||||
|
cd examples/server/tests
|
||||||
|
./tests.sh
|
||||||
|
|
||||||
|
- name: Tests (sanitizers)
|
||||||
|
id: server_integration_tests_sanitizers
|
||||||
|
if: ${{ matrix.sanitizer != '' }}
|
||||||
|
run: |
|
||||||
|
cd examples/server/tests
|
||||||
|
LLAMA_SANITIZE=1 ./tests.sh
|
||||||
|
|
||||||
|
- name: Slow tests
|
||||||
|
id: server_integration_tests_slow
|
||||||
|
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||||
|
run: |
|
||||||
|
cd examples/server/tests
|
||||||
|
SLOW_TESTS=1 ./tests.sh
|
||||||
|
|
||||||
|
|
||||||
|
server-windows:
|
||||||
|
runs-on: windows-2019
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Clone
|
||||||
|
id: checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
ref: ${{ github.event.inputs.sha || github.event.pull_request.head.sha || github.sha || github.head_ref || github.ref_name }}
|
||||||
|
|
||||||
|
- name: libCURL
|
||||||
|
id: get_libcurl
|
||||||
|
env:
|
||||||
|
CURL_VERSION: 8.6.0_6
|
||||||
|
run: |
|
||||||
|
curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip"
|
||||||
|
mkdir $env:RUNNER_TEMP/libcurl
|
||||||
|
tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
id: cmake_build
|
||||||
|
run: |
|
||||||
|
cmake -B build -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include"
|
||||||
|
cmake --build build --config Release -j ${env:NUMBER_OF_PROCESSORS} --target llama-server
|
||||||
|
|
||||||
|
- name: Python setup
|
||||||
|
id: setup_python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Tests dependencies
|
||||||
|
id: test_dependencies
|
||||||
|
run: |
|
||||||
|
pip install -r examples/server/tests/requirements.txt
|
||||||
|
|
||||||
|
- name: Copy Libcurl
|
||||||
|
id: prepare_libcurl
|
||||||
|
run: |
|
||||||
|
cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll
|
||||||
|
|
||||||
|
- name: Tests
|
||||||
|
id: server_integration_tests
|
||||||
|
if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
|
||||||
|
run: |
|
||||||
|
cd examples/server/tests
|
||||||
|
$env:PYTHONIOENCODING = ":replace"
|
||||||
|
pytest -v -x -m "not slow"
|
||||||
|
|
||||||
|
- name: Slow tests
|
||||||
|
id: server_integration_tests_slow
|
||||||
|
if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }}
|
||||||
|
run: |
|
||||||
|
cd examples/server/tests
|
||||||
|
$env:SLOW_TESTS = "1"
|
||||||
|
pytest -v -x
|
20 .github/workflows/tidy-post.yml (vendored)
@@ -1,20 +0,0 @@
name: clang-tidy review post comments

on:
workflow_dispatch:
workflows: ["clang-tidy-review"]
types:
- completed

jobs:
build:
runs-on: ubuntu-latest

steps:
- uses: ZedThree/clang-tidy-review/post@v0.13.0
# lgtm_comment_body, max_comments, and annotations need to be set on the posting workflow in a split setup
with:
# adjust options as necessary
lgtm_comment_body: ''
annotations: false
max_comments: 25
23 .github/workflows/tidy-review.yml (vendored)
@@ -1,23 +0,0 @@
name: clang-tidy-review

on:
pull_request:
branches:
- master

jobs:
clang-tidy-review:
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3

- uses: ZedThree/clang-tidy-review@v0.13.0
id: review
with:
lgtm_comment_body: ''
build_dir: build
cmake_command: cmake . -B build -DCMAKE_EXPORT_COMPILE_COMMANDS=on
split_workflow: true

- uses: ZedThree/clang-tidy-review/upload@v0.13.0
25 .github/workflows/zig-build.yml (vendored)
@@ -1,25 +0,0 @@
name: Zig CI

on:
pull_request:
push:
branches:
- master

jobs:
build:
strategy:
fail-fast: false
matrix:
runs-on: [ubuntu-latest, macos-latest, windows-latest]
runs-on: ${{ matrix.runs-on }}
steps:
- uses: actions/checkout@v3
with:
submodules: recursive
fetch-depth: 0
- uses: goto-bus-stop/setup-zig@v2
with:
version: 0.11.0
- name: Build Summary
run: zig build --summary all -freference-trace
179 .gitignore (vendored)
@@ -1,92 +1,145 @@
|
||||||
*.o
|
# Extensions
|
||||||
|
|
||||||
*.a
|
*.a
|
||||||
*.so
|
|
||||||
*.gguf
|
|
||||||
*.bin
|
|
||||||
*.exe
|
|
||||||
*.dll
|
|
||||||
*.log
|
|
||||||
*.gcov
|
|
||||||
*.gcno
|
|
||||||
*.gcda
|
|
||||||
*.dot
|
|
||||||
*.bat
|
*.bat
|
||||||
|
*.bin
|
||||||
|
*.d
|
||||||
|
*.dll
|
||||||
|
*.dot
|
||||||
|
*.etag
|
||||||
|
*.exe
|
||||||
|
*.gcda
|
||||||
|
*.gcno
|
||||||
|
*.gcov
|
||||||
|
*.gguf
|
||||||
|
*.gguf.json
|
||||||
|
*.lastModified
|
||||||
|
*.log
|
||||||
*.metallib
|
*.metallib
|
||||||
.DS_Store
|
*.o
|
||||||
.build/
|
*.so
|
||||||
|
*.swp
|
||||||
|
*.tmp
|
||||||
|
|
||||||
|
# IDE / OS
|
||||||
|
|
||||||
.cache/
|
.cache/
|
||||||
.ccls-cache/
|
.ccls-cache/
|
||||||
.direnv/
|
.direnv/
|
||||||
|
.DS_Store
|
||||||
.envrc
|
.envrc
|
||||||
|
.idea/
|
||||||
.swiftpm
|
.swiftpm
|
||||||
.venv
|
|
||||||
.clang-tidy
|
|
||||||
.vs/
|
.vs/
|
||||||
.vscode/
|
.vscode/
|
||||||
|
nppBackup
|
||||||
|
|
||||||
|
|
||||||
|
# Coverage
|
||||||
|
|
||||||
lcov-report/
|
|
||||||
gcovr-report/
|
gcovr-report/
|
||||||
|
lcov-report/
|
||||||
|
|
||||||
|
# Build Artifacts
|
||||||
|
|
||||||
|
tags
|
||||||
|
.build/
|
||||||
build*
|
build*
|
||||||
|
!build-info.cmake
|
||||||
|
!build-info.cpp.in
|
||||||
|
!build-info.sh
|
||||||
|
!build.zig
|
||||||
|
!docs/build.md
|
||||||
|
/libllama.so
|
||||||
|
/llama-*
|
||||||
|
/vulkan-shaders-gen
|
||||||
|
android-ndk-*
|
||||||
|
arm_neon.h
|
||||||
|
cmake-build-*
|
||||||
|
CMakeSettings.json
|
||||||
|
compile_commands.json
|
||||||
|
ggml-metal-embed.metal
|
||||||
|
llama-batched-swift
|
||||||
|
/rpc-server
|
||||||
out/
|
out/
|
||||||
tmp/
|
tmp/
|
||||||
|
autogen-*.md
|
||||||
|
|
||||||
|
# Deprecated
|
||||||
|
|
||||||
|
/main
|
||||||
|
/server
|
||||||
|
|
||||||
|
# CI
|
||||||
|
|
||||||
|
!.github/workflows/*.yml
|
||||||
|
|
||||||
|
# Models
|
||||||
|
|
||||||
models/*
|
models/*
|
||||||
models-mnt
|
models-mnt
|
||||||
|
!models/.editorconfig
|
||||||
|
!models/ggml-vocab-*.gguf*
|
||||||
|
|
||||||
/Pipfile
|
# Zig
|
||||||
/baby-llama
|
|
||||||
/beam-search
|
|
||||||
/benchmark-matmult
|
|
||||||
/convert-llama2c-to-ggml
|
|
||||||
/embd-input-test
|
|
||||||
/embedding
|
|
||||||
/gguf
|
|
||||||
/gguf-llama-simple
|
|
||||||
/imatrix
|
|
||||||
/infill
|
|
||||||
/libllama.so
|
|
||||||
/llama-bench
|
|
||||||
/llava-cli
|
|
||||||
/lookahead
|
|
||||||
/lookup
|
|
||||||
/main
|
|
||||||
/metal
|
|
||||||
/passkey
|
|
||||||
/perplexity
|
|
||||||
/q8dot
|
|
||||||
/quantize
|
|
||||||
/quantize-stats
|
|
||||||
/result
|
|
||||||
/save-load-state
|
|
||||||
/server
|
|
||||||
/simple
|
|
||||||
/batched
|
|
||||||
/batched-bench
|
|
||||||
/export-lora
|
|
||||||
/finetune
|
|
||||||
/speculative
|
|
||||||
/parallel
|
|
||||||
/train-text-from-scratch
|
|
||||||
/tokenize
|
|
||||||
/vdot
|
|
||||||
/common/build-info.cpp
|
|
||||||
arm_neon.h
|
|
||||||
compile_commands.json
|
|
||||||
CMakeSettings.json
|
|
||||||
|
|
||||||
__pycache__
|
|
||||||
dist
|
|
||||||
|
|
||||||
zig-out/
|
zig-out/
|
||||||
zig-cache/
|
zig-cache/
|
||||||
|
|
||||||
|
# Logs
|
||||||
|
|
||||||
ppl-*.txt
|
ppl-*.txt
|
||||||
qnt-*.txt
|
qnt-*.txt
|
||||||
perf-*.txt
|
perf-*.txt
|
||||||
|
|
||||||
examples/jeopardy/results.txt
|
# Examples
|
||||||
|
|
||||||
poetry.lock
|
examples/jeopardy/results.txt
|
||||||
|
examples/server/*.css.hpp
|
||||||
|
examples/server/*.html.hpp
|
||||||
|
examples/server/*.js.hpp
|
||||||
|
examples/server/*.mjs.hpp
|
||||||
|
!build_64.sh
|
||||||
|
!examples/*.bat
|
||||||
|
!examples/*/*.kts
|
||||||
|
!examples/*/*/*.kts
|
||||||
|
!examples/sycl/*.bat
|
||||||
|
!examples/sycl/*.sh
|
||||||
|
|
||||||
|
# Server Web UI temporary files
|
||||||
|
node_modules
|
||||||
|
examples/server/webui/dist
|
||||||
|
|
||||||
|
# Python
|
||||||
|
|
||||||
|
/.venv
|
||||||
|
__pycache__/
|
||||||
|
*/poetry.lock
|
||||||
poetry.toml
|
poetry.toml
|
||||||
nppBackup
|
|
||||||
|
# Nix
|
||||||
|
/result
|
||||||
|
|
||||||
|
# Test binaries
|
||||||
|
/tests/test-backend-ops
|
||||||
|
/tests/test-double-float
|
||||||
|
/tests/test-grad0
|
||||||
|
/tests/test-grammar-parser
|
||||||
|
/tests/test-llama-grammar
|
||||||
|
/tests/test-opt
|
||||||
|
/tests/test-quantize-fns
|
||||||
|
/tests/test-quantize-perf
|
||||||
|
/tests/test-rope
|
||||||
|
/tests/test-sampling
|
||||||
|
/tests/test-tokenizer-0
|
||||||
|
/tests/test-tokenizer-1-bpe
|
||||||
|
/tests/test-tokenizer-1-spm
|
||||||
|
|
||||||
|
# Scripts
|
||||||
|
!/scripts/install-oneapi.bat
|
||||||
|
|
||||||
|
# Test models for lora adapters
|
||||||
|
/lora-tests
|
||||||
|
|
||||||
|
# Local scripts
|
||||||
|
/run-vim.sh
|
||||||
|
/run-chat.sh
|
||||||
|
|
2 .gitmodules (vendored)
@@ -1,3 +1,3 @@
 [submodule "kompute"]
-path = kompute
+path = ggml/src/ggml-kompute/kompute
 url = https://github.com/nomic-ai/kompute.git
@@ -3,13 +3,14 @@
 exclude: prompts/.*.txt
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-rev: v3.2.0
+rev: v4.6.0
 hooks:
 - id: trailing-whitespace
 - id: end-of-file-fixer
 - id: check-yaml
 - id: check-added-large-files
 - repo: https://github.com/PyCQA/flake8
-rev: 6.0.0
+rev: 7.0.0
 hooks:
 - id: flake8
+additional_dependencies: [flake8-no-print]
1199 CMakeLists.txt
File diff suppressed because it is too large
97 CMakePresets.json (new file)
@@ -0,0 +1,97 @@
|
||||||
|
{
|
||||||
|
"version": 4,
|
||||||
|
"configurePresets": [
|
||||||
|
{
|
||||||
|
"name": "base",
|
||||||
|
"hidden": true,
|
||||||
|
"generator": "Ninja",
|
||||||
|
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||||
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "sycl-base",
|
||||||
|
"hidden": true,
|
||||||
|
"generator": "Ninja",
|
||||||
|
"binaryDir": "${sourceDir}/build-${presetName}",
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_EXPORT_COMPILE_COMMANDS": "ON",
|
||||||
|
"CMAKE_CXX_COMPILER": "icx",
|
||||||
|
"CMAKE_C_COMPILER": "cl",
|
||||||
|
"GGML_SYCL": "ON",
|
||||||
|
"CMAKE_INSTALL_RPATH": "$ORIGIN;$ORIGIN/.."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ "name": "debug", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Debug" } },
|
||||||
|
{ "name": "release", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "Release" } },
|
||||||
|
{ "name": "reldbg", "hidden": true, "cacheVariables": { "CMAKE_BUILD_TYPE": "RelWithDebInfo" } },
|
||||||
|
{ "name": "static", "hidden": true, "cacheVariables": { "GGML_STATIC": "ON" } },
|
||||||
|
{ "name": "sycl_f16", "hidden": true, "cacheVariables": { "GGML_SYCL_F16": "ON" } },
|
||||||
|
{ "name": "vulkan", "hidden": true, "cacheVariables": { "GGML_VULKAN": "ON" } },
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "x64-windows-llvm", "hidden": true,
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/x64-windows-llvm.cmake"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "arm64-windows-msvc", "hidden": true,
|
||||||
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
|
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-msvc.cmake"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "arm64-windows-llvm", "hidden": true,
|
||||||
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
|
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-windows-llvm.cmake"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{
|
||||||
|
"name": "arm64-apple-clang", "hidden": true,
|
||||||
|
"architecture": { "value": "arm64", "strategy": "external" },
|
||||||
|
"toolset": { "value": "host=x64", "strategy": "external" },
|
||||||
|
"cacheVariables": {
|
||||||
|
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/cmake/arm64-apple-clang.cmake"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
{ "name": "arm64-windows-llvm-debug", "inherits": [ "base", "arm64-windows-llvm", "debug" ] },
|
||||||
|
{ "name": "arm64-windows-llvm-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg" ] },
|
||||||
|
{ "name": "arm64-windows-llvm+static-release", "inherits": [ "base", "arm64-windows-llvm", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "arm64-apple-clang-debug", "inherits": [ "base", "arm64-apple-clang", "debug" ] },
|
||||||
|
{ "name": "arm64-apple-clang-release", "inherits": [ "base", "arm64-apple-clang", "reldbg" ] },
|
||||||
|
{ "name": "arm64-apple-clang+static-release", "inherits": [ "base", "arm64-apple-clang", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "arm64-windows-msvc-debug", "inherits": [ "base", "arm64-windows-msvc", "debug" ] },
|
||||||
|
{ "name": "arm64-windows-msvc-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg" ] },
|
||||||
|
{ "name": "arm64-windows-msvc+static-release", "inherits": [ "base", "arm64-windows-msvc", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-llvm-debug", "inherits": [ "base", "x64-windows-llvm", "debug" ] },
|
||||||
|
{ "name": "x64-windows-llvm-release", "inherits": [ "base", "x64-windows-llvm", "release" ] },
|
||||||
|
{ "name": "x64-windows-llvm-reldbg", "inherits": [ "base", "x64-windows-llvm", "reldbg" ] },
|
||||||
|
{ "name": "x64-windows-llvm+static-release", "inherits": [ "base", "x64-windows-llvm", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-msvc-debug", "inherits": [ "base", "debug" ] },
|
||||||
|
{ "name": "x64-windows-msvc-release", "inherits": [ "base", "reldbg" ] },
|
||||||
|
{ "name": "x64-windows-msvc+static-release", "inherits": [ "base", "reldbg", "static" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-sycl-debug", "inherits": [ "sycl-base", "debug" ] },
|
||||||
|
{ "name": "x64-windows-sycl-debug-f16", "inherits": [ "sycl-base", "debug", "sycl_f16" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release", "inherits": [ "sycl-base", "release" ] },
|
||||||
|
{ "name": "x64-windows-sycl-release-f16", "inherits": [ "sycl-base", "release", "sycl_f16" ] },
|
||||||
|
|
||||||
|
{ "name": "x64-windows-vulkan-debug", "inherits": [ "base", "vulkan", "debug" ] },
|
||||||
|
{ "name": "x64-windows-vulkan-release", "inherits": [ "base", "vulkan", "release" ] }
|
||||||
|
]
|
||||||
|
}
|
11 CODEOWNERS (new file)
@@ -0,0 +1,11 @@
# collaborators can optionally add themselves here to indicate their availability for reviewing related PRs

/ci/ @ggerganov
/.devops/*.Dockerfile @ngxson
/examples/server/ @ngxson
/ggml/src/ggml-cuda/fattn* @JohannesGaessler
/ggml/src/ggml-cuda/mmq.* @JohannesGaessler
/ggml/src/ggml-cuda/mmv.* @JohannesGaessler
/ggml/src/ggml-cuda/mmvq.* @JohannesGaessler
/ggml/src/ggml-opt.cpp @JohannesGaessler
/ggml/src/gguf.cpp @JohannesGaessler
125 CONTRIBUTING.md (new file)
@@ -0,0 +1,125 @@
|
||||||
|
# Pull requests (for contributors)
|
||||||
|
|
||||||
|
- Test your changes:
|
||||||
|
- Execute [the full CI locally on your machine](ci/README.md) before publishing
|
||||||
|
- Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
|
||||||
|
- If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
|
||||||
|
- If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
|
||||||
|
- Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
|
||||||
|
- If your PR becomes stale, don't hesitate to ping the maintainers in the comments
|
||||||
|
|
||||||
|
# Pull requests (for collaborators)
|
||||||
|
|
||||||
|
- Squash-merge PRs
|
||||||
|
- Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
|
||||||
|
- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
|
||||||
|
- Consider adding yourself to [CODEOWNERS](CODEOWNERS)
|
||||||
|
|
||||||
|
# Coding guidelines
|
||||||
|
|
||||||
|
- Avoid adding third-party dependencies, extra files, extra headers, etc.
|
||||||
|
- Always consider cross-compatibility with other operating systems and architectures
|
||||||
|
- Avoid fancy-looking modern STL constructs, use basic `for` loops, avoid templates, keep it simple
|
||||||
|
- Vertical alignment makes things more readable and easier to batch edit
|
||||||
|
- Clean-up any trailing whitespaces, use 4 spaces for indentation, brackets on the same line, `void * ptr`, `int & a`
|
||||||
|
- Use sized integer types such as `int32_t` in the public API, e.g. `size_t` may also be appropriate for allocation sizes or byte offsets
|
||||||
|
- Declare structs with `struct foo {}` instead of `typedef struct foo {} foo`
|
||||||
|
- In C++ code omit optional `struct` and `enum` keyword whenever they are not necessary
|
||||||
|
```cpp
|
||||||
|
// OK
|
||||||
|
llama_context * ctx;
|
||||||
|
const llama_rope_type rope_type;
|
||||||
|
|
||||||
|
// not OK
|
||||||
|
struct llama_context * ctx;
|
||||||
|
const enum llama_rope_type rope_type;
|
||||||
|
```
|
||||||
|
|
||||||
|
_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline.)_
|
||||||
|
|
||||||
|
- Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
|
||||||
|
- For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
|
||||||
|
- Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
|
||||||
|
- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
# Naming guidelines

- Use `snake_case` for function, variable and type names
- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)

```cpp
// not OK
int small_number;
int big_number;

// OK
int number_small;
int number_big;
```

- Enum values are always in upper case and prefixed with the enum name

```cpp
enum llama_vocab_type {
    LLAMA_VOCAB_TYPE_NONE = 0,
    LLAMA_VOCAB_TYPE_SPM  = 1,
    LLAMA_VOCAB_TYPE_BPE  = 2,
    LLAMA_VOCAB_TYPE_WPM  = 3,
    LLAMA_VOCAB_TYPE_UGM  = 4,
    LLAMA_VOCAB_TYPE_RWKV = 5,
};
```

- The general naming pattern is `<class>_<method>`, with `<method>` being `<action>_<noun>`

```cpp
llama_model_init();           // class: "llama_model",         method: "init"
llama_sampler_chain_remove(); // class: "llama_sampler_chain", method: "remove"
llama_sampler_get_seed();     // class: "llama_sampler",       method: "get_seed"
llama_set_embeddings();       // class: "llama_context",       method: "set_embeddings"
llama_n_threads();            // class: "llama_context",       method: "n_threads"
llama_adapter_lora_free();    // class: "llama_adapter_lora",  method: "free"
```

- The `get` `<action>` can be omitted
- The `<noun>` can be omitted if not necessary
- The `_context` suffix of the `<class>` is optional. Use it to disambiguate symbols when needed
- Use `init`/`free` for constructor/destructor `<action>`
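As an additional illustration of these rules (a sketch only; treat the exact symbols as hypothetical rather than a guaranteed part of the API):

```cpp
llama_n_ctx();      // "get" omitted and "_context" dropped from the class: reads as "llama_context: n_ctx"
llama_model_free(); // "free" used as the destructor <action>, no <noun> needed
```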
- Use the `_t` suffix when a type is supposed to be opaque to the user - it's not relevant to them if it is a struct or anything else

```cpp
typedef struct llama_context * llama_context_t;

enum llama_pooling_type llama_pooling_type(const llama_context_t ctx);
```

_(NOTE: this guideline is yet to be applied to the `llama.cpp` codebase. New code should follow this guideline)_

- C/C++ filenames are all lowercase with dashes. Headers use the `.h` extension. Source files use the `.c` or `.cpp` extension
- Python filenames are all lowercase with underscores

- _(TODO: abbreviations usage)_

# Preprocessor directives

- _(TODO: add guidelines with examples and apply them to the codebase)_

```cpp
#ifdef FOO
#endif // FOO
```
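As an illustrative sketch only (the guideline itself is still marked TODO), the same closing-comment style extends naturally to longer conditionals:

```cpp
#ifdef FOO
    // code path used when FOO is defined
#else
    // fallback path
#endif // FOO
```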
# Documentation

- Documentation is a community effort
- When you need to look into the source code to figure out how to use an API, consider adding a short summary to the header file for future reference
- When you notice incorrect or outdated documentation, please update it
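For instance, a minimal sketch of such a summary in a header (the function below is hypothetical and only meant to show the level of detail):

```cpp
// (hypothetical function, shown only to illustrate the style of a short summary)
// Copies the embeddings of the last evaluated batch into the provided buffer.
// The buffer must hold at least n_embd * n_tokens floats.
int32_t llama_get_batch_embeddings(llama_context * ctx, float * dst);
```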
# Resources

The GitHub issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from GitHub projects:

https://github.com/ggerganov/llama.cpp/projects
LICENSE

```diff
@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 Georgi Gerganov
+Copyright (c) 2023-2024 The ggml authors

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
```
Package.swift

```diff
@@ -13,34 +13,7 @@ let package = Package(
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
     targets: [
-        .target(
-            name: "llama",
-            dependencies: ["ggml"],
-            path: ".",
-            exclude: ["ggml-metal.metal"],
-            sources: [
-                "llama.cpp",
-            ],
-            publicHeadersPath: "spm-headers",
-            cSettings: [
-                .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
-                .define("GGML_USE_ACCELERATE"),
-                .unsafeFlags(["-fno-objc-arc"]),
-                .define("GGML_USE_METAL"),
-                // NOTE: NEW_LAPACK will required iOS version 16.4+
-                // We should consider add this in the future when we drop support for iOS 14
-                // (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
-                // .define("ACCELERATE_NEW_LAPACK"),
-                // .define("ACCELERATE_LAPACK_ILP64")
-            ],
-            linkerSettings: [
-                .linkedFramework("Accelerate")
-            ]
-        )
-    ],
-    cxxLanguageStandard: .cxx11
+        .systemLibrary(name: "llama", pkgConfig: "llama"),
+    ]
 )
```
README-sycl.md (removed)

# llama.cpp for SYCL

[Background](#background)

[OS](#os)

[Intel GPU](#intel-gpu)

[Linux](#linux)

[Windows](#windows)

[Environment Variable](#environment-variable)

[Known Issue](#known-issue)

[Q&A](#q&a)

[Todo](#todo)

## Background

SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators, such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

Intel uses SYCL as the direct programming language to support CPUs, GPUs and FPGAs.

To avoid reinventing the wheel, this code follows the other backend code paths in llama.cpp (such as OpenBLAS, cuBLAS, CLBlast) and uses the open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (commercial release: [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) to migrate the code to SYCL.

llama.cpp for SYCL is used to support Intel GPUs.

For Intel CPUs, we recommend using llama.cpp for x86 instead (built with Intel MKL).

## OS

| OS      | Status  | Verified     |
|---------|---------|--------------|
| Linux   | Support | Ubuntu 22.04 |
| Windows | Support | Windows 11   |

## Intel GPU

| Intel GPU                     | Status  | Verified Model                  |
|-------------------------------|---------|---------------------------------|
| Intel Data Center Max Series  | Support | Max 1550                        |
| Intel Data Center Flex Series | Support | Flex 170                        |
| Intel Arc Series              | Support | Arc 770, 730M                   |
| Intel built-in Arc GPU        | Support | built-in Arc GPU in Meteor Lake |
| Intel iGPU                    | Support | iGPU in i5-1250P, i7-1165G7     |

## Linux

### Setup Environment

1. Install the Intel GPU driver.

a. Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).

Note: for an iGPU, install the client GPU driver.

b. Add your user to the `video` and `render` groups:

```
sudo usermod -aG render username
sudo usermod -aG video username
```

Note: re-login for the change to take effect.

c. Check:

```
sudo apt install clinfo
sudo clinfo -l
```

Output (example):

```
Platform #0: Intel(R) OpenCL Graphics
 `-- Device #0: Intel(R) Arc(TM) A770 Graphics

Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
```

2. Install the Intel® oneAPI Base Toolkit.

a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

We recommend installing to the default folder: **/opt/intel/oneapi**. The following guide uses the default folder as an example; if you use another folder, adjust the paths accordingly.

b. Check:

```
source /opt/intel/oneapi/setvars.sh

sycl-ls
```

There should be one or more level-zero devices, like **[ext_oneapi_level_zero:gpu:0]**.

Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO [23.30.26918.50]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
```

3. Build locally:

```
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh

# for FP16
#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference

# for FP32
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# build example/main only
#cmake --build . --config Release --target main

# build all binaries
cmake --build . --config Release -v

cd ..
```

or

```
./examples/sycl/build.sh
```

Note:

- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.

### Run

1. Put the model file into the **models** folder.

2. Enable the oneAPI running environment:

```
source /opt/intel/oneapi/setvars.sh
```

3. List the device IDs.

Run without parameters:

```
./build/bin/ls-sycl-device

or

./build/bin/main
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

| Attribute              | Note                                                  |
|------------------------|-------------------------------------------------------|
| compute capability 1.3 | Level-zero runtime, recommended                       |
| compute capability 3.0 | OpenCL runtime, slower than level-zero in most cases  |

4. Set the device ID and execute llama.cpp.

Set device ID = 0 with **GGML_SYCL_DEVICE=0**:

```
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```

or run by script:

```
./examples/sycl/run-llama2.sh
```

Note:

- By default, mmap is used to read the model file. In some cases this leads to a hang. We recommend adding the **--no-mmap** parameter to disable mmap() and avoid this issue.

5. Check the device ID in the output, like:

```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```

## Windows

### Setup Environment

1. Install the Intel GPU driver.

Install the Intel GPU driver following the official guide: [Install GPU Drivers](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/arc/software/drivers.html).

2. Install the Intel® oneAPI Base Toolkit.

a. Follow the procedure in [Get the Intel® oneAPI Base Toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

We recommend installing to the default folder. The following guide uses the default folder as an example; if you use another folder, adjust the paths accordingly.

b. Enable the oneAPI running environment:

- In Search, input 'oneAPI'. Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022".

- Or, in a CMD window, run:

```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

c. Check the GPU.

In the oneAPI command line:

```
sycl-ls
```

There should be one or more level-zero devices, like **[ext_oneapi_level_zero:gpu:0]**.

Output (example):
```
[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2 [2023.16.10.0.17_160000]
[opencl:cpu:1] Intel(R) OpenCL, 11th Gen Intel(R) Core(TM) i7-1185G7 @ 3.00GHz OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Iris(R) Xe Graphics OpenCL 3.0 NEO [31.0.101.5186]
[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Iris(R) Xe Graphics 1.3 [1.3.28044]
```

3. Install cmake & make.

a. Download & install cmake for Windows: https://cmake.org/download/

b. Download & install make for Windows provided by mingw-w64: https://www.mingw-w64.org/downloads/

### Build locally

In the oneAPI command line window:

```
mkdir -p build
cd build
@call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force

:: for FP16
:: faster for long-prompt inference
:: cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release -DLLAMA_SYCL_F16=ON

:: for FP32
cmake -G "MinGW Makefiles" .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icx -DCMAKE_BUILD_TYPE=Release

:: build example/main only
:: make main

:: build all binaries
make -j
cd ..
```

or

```
.\examples\sycl\win-build-sycl.bat
```

Note:

- By default, all binaries are built, which takes more time. To reduce the build time, we recommend building **example/main** only.

### Run

1. Put the model file into the **models** folder.

2. Enable the oneAPI running environment:

- In Search, input 'oneAPI'. Search & open "Intel oneAPI command prompt for Intel 64 for Visual Studio 2022".

- Or, in a CMD window, run:

```
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
```

3. List the device IDs.

Run without parameters:

```
build\bin\ls-sycl-device.exe

or

build\bin\main.exe
```

Check the IDs in the startup log, like:

```
found 4 SYCL devices:
  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2,
    max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280
  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0,
    max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280
  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0,
    max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136
```

| Attribute              | Note                                                  |
|------------------------|-------------------------------------------------------|
| compute capability 1.3 | Level-zero runtime, recommended                       |
| compute capability 3.0 | OpenCL runtime, slower than level-zero in most cases  |

4. Set the device ID and execute llama.cpp.

Set device ID = 0 with **set GGML_SYCL_DEVICE=0**:

```
set GGML_SYCL_DEVICE=0
build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0
```

or run by script:

```
.\examples\sycl\win-run-llama2.bat
```

Note:

- By default, mmap is used to read the model file. In some cases this leads to a hang. We recommend adding the **--no-mmap** parameter to disable mmap() and avoid this issue.

5. Check the device ID in the output, like:

```
Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
```

## Environment Variable

#### Build

| Name               | Value                       | Function                                                                                                  |
|--------------------|-----------------------------|-----------------------------------------------------------------------------------------------------------|
| LLAMA_SYCL         | ON (mandatory)              | Enable the build with the SYCL code path. For FP32/FP16, LLAMA_SYCL=ON is mandatory.                       |
| LLAMA_SYCL_F16     | ON (optional)               | Enable the FP16 build with the SYCL code path. Faster for long-prompt inference. Do not set it for FP32.   |
| CMAKE_C_COMPILER   | icx                         | Use the icx compiler for the SYCL code path                                                                |
| CMAKE_CXX_COMPILER | icpx (Linux), icx (Windows) | Use icpx/icx for the SYCL code path                                                                        |

#### Running

| Name             | Value            | Function                                                                  |
|------------------|------------------|---------------------------------------------------------------------------|
| GGML_SYCL_DEVICE | 0 (default) or 1 | Set the device ID used. Check the device IDs in the default run output    |
| GGML_SYCL_DEBUG  | 0 (default) or 1 | Enable the log function via the macro GGML_SYCL_DEBUG                     |

## Known Issue

- Hang during startup

  llama.cpp uses mmap by default to read the model file and copy it to the GPU. On some systems the memcpy is abnormal and blocks.

  Solution: add **--no-mmap**.

## Q&A

- Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.

  The oneAPI running environment is not enabled. Install the oneAPI base toolkit and enable it with: `source /opt/intel/oneapi/setvars.sh`.

- On Windows: no output, and no error.

  The oneAPI running environment is not enabled.

## Todo

- Support building on Windows.

- Support multiple cards.
SECURITY.md (new file)

# Security Policy

- [**Using llama.cpp securely**](#using-llamacpp-securely)
  - [Untrusted models](#untrusted-models)
  - [Untrusted inputs](#untrusted-inputs)
  - [Data privacy](#data-privacy)
  - [Untrusted environments or networks](#untrusted-environments-or-networks)
  - [Multi-Tenant environments](#multi-tenant-environments)
- [**Reporting a vulnerability**](#reporting-a-vulnerability)

## Using llama.cpp securely

### Untrusted models
Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources.

*Always execute untrusted models within a secure, isolated environment such as a sandbox* (e.g., containers, virtual machines). This helps protect your system from potentially malicious code.

> [!NOTE]
> The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.

### Untrusted inputs

Some models accept various input formats (text, images, audio, etc.). The libraries converting these inputs have varying security levels, so it's crucial to isolate the model and carefully pre-process inputs to mitigate script injection risks.

For maximum security when handling untrusted inputs, you may need to employ the following:

* Sandboxing: Isolate the environment where the inference happens.
* Pre-analysis: Check how the model performs by default when exposed to prompt injection (e.g. using [fuzzing for prompt injection](https://github.com/FonduAI/awesome-prompt-injection?tab=readme-ov-file#tools)). This will give you leads on how hard you will have to work on the next topics.
* Updates: Keep both LLaMA C++ and your libraries updated with the latest security patches.
* Input Sanitization: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as:
  * Validation: Enforce strict rules on allowed characters and data types.
  * Filtering: Remove potentially malicious scripts or code fragments.
  * Encoding: Convert special characters into safe representations.
  * Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)).

### Data privacy

To protect sensitive data from potential leaks or unauthorized access, it is crucial to sandbox the model execution. This means running the model in a secure, isolated environment, which helps mitigate many attack vectors.

### Untrusted environments or networks

If you can't run your models in a secure and isolated environment, or if they must be exposed to an untrusted network, make sure to take the following security precautions:
* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value.
* Encrypt your data if sending it over the network.

### Multi-Tenant environments

If you intend to run multiple models in parallel with shared memory, it is your responsibility to ensure the models do not interact or access each other's data. The primary areas of concern are tenant isolation, resource allocation, model sharing and hardware attacks.

1. Tenant Isolation: Models should run separately with strong isolation methods to prevent unwanted data access. Separating networks is crucial for isolation, as it prevents unauthorized access to data or models and prevents malicious users from sending graphs to execute under another tenant's identity.

2. Resource Allocation: A denial of service caused by one model can impact the overall system health. Implement safeguards like rate limits, access controls, and health monitoring.

3. Model Sharing: In a multi-tenant model-sharing design, tenants and users must understand the security risks of running code provided by others. Since there are no reliable methods to detect malicious models, sandboxing the model execution is the recommended approach to mitigate the risk.

4. Hardware Attacks: GPUs or TPUs can also be attacked. [Research](https://scholar.google.com/scholar?q=gpu+side+channel) has shown that side-channel attacks on GPUs are possible, which can leak data from other models or processes running on the same system at the same time.

## Reporting a vulnerability

Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-securely) are considered vulnerabilities of LLaMA C++.

<!-- normal version -->
However, if you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.

Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).

This project is maintained by a team of volunteers on a reasonable-effort basis. As such, please give us at least 90 days to work on a fix before public exposure.
40
SHA256SUMS
40
SHA256SUMS
|
@ -1,40 +0,0 @@
|
||||||
700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d models/7B/consolidated.00.pth
|
|
||||||
666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847 models/7B/ggml-model-f16.bin
|
|
||||||
ec2f2d1f0dfb73b72a4cbac7fa121abbe04c37ab327125a38248f930c0f09ddf models/7B/ggml-model-q4_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q4_1.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/7B/ggml-model-q5_1.bin
|
|
||||||
7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265 models/7B/params.json
|
|
||||||
745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08 models/13B/consolidated.00.pth
|
|
||||||
d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085 models/13B/consolidated.01.pth
|
|
||||||
2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808 models/13B/ggml-model-f16.bin
|
|
||||||
fad169e6f0f575402cf75945961cb4a8ecd824ba4da6be2af831f320c4348fa5 models/13B/ggml-model-q4_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q4_1.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/13B/ggml-model-q5_1.bin
|
|
||||||
4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f models/13B/params.json
|
|
||||||
e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067 models/30B/consolidated.00.pth
|
|
||||||
4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff models/30B/consolidated.01.pth
|
|
||||||
24a87f01028cbd3a12de551dcedb712346c0b5cbdeff1454e0ddf2df9b675378 models/30B/consolidated.02.pth
|
|
||||||
1adfcef71420886119544949767f6a56cb6339b4d5fcde755d80fe68b49de93b models/30B/consolidated.03.pth
|
|
||||||
7e1b524061a9f4b27c22a12d6d2a5bf13b8ebbea73e99f218809351ed9cf7d37 models/30B/ggml-model-f16.bin
|
|
||||||
d2a441403944819492ec8c2002cc36fa38468149bfb4b7b4c52afc7bd9a7166d models/30B/ggml-model-q4_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q4_1.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/30B/ggml-model-q5_1.bin
|
|
||||||
2c07118ea98d69dbe7810d88520e30288fa994751b337f8fca02b171955f44cb models/30B/params.json
|
|
||||||
135c563f6b3938114458183afb01adc9a63bef3d8ff7cccc3977e5d3664ecafe models/65B/consolidated.00.pth
|
|
||||||
9a600b37b19d38c7e43809485f70d17d1dc12206c07efa83bc72bb498a568bde models/65B/consolidated.01.pth
|
|
||||||
e7babf7c5606f165a3756f527cb0fedc4f83e67ef1290391e52fb1cce5f26770 models/65B/consolidated.02.pth
|
|
||||||
73176ffb426b40482f2aa67ae1217ef79fbbd1fff5482bae5060cdc5a24ab70e models/65B/consolidated.03.pth
|
|
||||||
882e6431d0b08a8bc66261a0d3607da21cbaeafa96a24e7e59777632dbdac225 models/65B/consolidated.04.pth
|
|
||||||
a287c0dfe49081626567c7fe87f74cce5831f58e459b427b5e05567641f47b78 models/65B/consolidated.05.pth
|
|
||||||
72b4eba67a1a3b18cb67a85b70f8f1640caae9b40033ea943fb166bd80a7b36b models/65B/consolidated.06.pth
|
|
||||||
d27f5b0677d7ff129ceacd73fd461c4d06910ad7787cf217b249948c3f3bc638 models/65B/consolidated.07.pth
|
|
||||||
60758f2384d74e423dffddfd020ffed9d3bb186ebc54506f9c4a787d0f5367b0 models/65B/ggml-model-f16.bin
|
|
||||||
cde053439fa4910ae454407e2717cc46cc2c2b4995c00c93297a2b52e790fa92 models/65B/ggml-model-q4_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q4_1.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_0.bin
|
|
||||||
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff models/65B/ggml-model-q5_1.bin
|
|
||||||
999ed1659b469ccc2a941714c0a9656fa571d17c9f7c8c7589817ca90edef51b models/65B/params.json
|
|
||||||
9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 models/tokenizer.model
|
|
Sources/llama/llama.h (new file)

```cpp
#pragma once

#include <llama.h>
```

Sources/llama/module.modulemap (new file)

```
module llama [system] {
    header "llama.h"
    link "llama"
    export *
}
```
awq-py/README.md (removed)

# AWQ: Activation-aware Weight Quantization for LLM - version applied to llama.cpp
[[Paper](https://arxiv.org/abs/2306.00978)][[Original Repo](https://github.com/mit-han-lab/llm-awq)][[Easy-to-use Repo](https://github.com/casper-hansen/AutoAWQ)]

**Supported models:**

- [X] LLaMA
- [x] LLaMA 2
- [X] MPT
- [X] Mistral AI v0.1
- [ ] Bloom
- [ ] Mixtral MoE

**TODO:**
- [x] Update version to work with both MPT and MPT-AWQ models
- [ ] Add OPT model
- [ ] Add Bloom model
- [ ] Add Mixtral MoE
- [ ] Support w3, w2

## Contents

- [Install](#install)
- [Convert](#convert)
- [Quantize](#quantize)
- [Test](#test)
- [Benchmark](#benchmark)
- [Results](#results)

## Install
Install the requirements:
```bash
pip install -r requirements.txt
```
Get the pre-computed AWQ search results for multiple model families, including LLaMA, LLaMA2, MPT, OPT:
```bash
git clone https://huggingface.co/datasets/mit-han-lab/awq-model-zoo awq_cache
```

## Convert
Example for a llama model:
```bash
# For llama7b and llama2 models
python convert.py models/llama-7b/ --awq-path awq_cache/llama-7b-w4-g128.pt --outfile models/llama_7b_fp16.gguf
# For mistral and mpt models
python convert-hf-to-gguf.py models/mpt-7b/ --awq-path awq_cache/mpt-7b-w4-g128.pt --outfile models/mpt_7b_fp16.gguf
```

## Quantize
```bash
# We only benchmark and confirm the results on the q4_0, q4_1, and q2_k types.
./quantize models/llama_7b_fp16.gguf models/llama_7b_q4_0.gguf q4_0
```

## Test
```bash
# For all models.
./build/bin/main -m models/llama_7b_q4_0.gguf -n 128 --prompt "Once upon a time"
```

## Benchmark
The perplexity measurements in the tables below are done against the `wikitext2` test dataset (https://paperswithcode.com/dataset/wikitext-2), with a context length of 512.
```bash
# For llama, llama2, and mistral models.
./perplexity -m models/llama_7b_q4_0.gguf -f datasets/wikitext-2-raw/wiki.test.raw
```

## Results
Results are run on OpenBLAS (CPU) and cuBLAS (GPU) for fair comparison.
We use three llama.cpp quantization types to work with our version: q4_0, q4_1, and q2_k.

### Llama 7B (Build with OpenBLAS)

| Model        | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|-------------:|-------------|-------:|-------:|-------:|-------:|
| Llama 7B     | perplexity  | 5.9066 | 6.1214 | 6.0643 | 6.5808 |
| Llama 7B     | file size   | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| Llama 7B     | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |
| AWQ-LLama 7B | perplexity  | 5.9175 | 6.0252 | 5.9987 | 6.3692 |
| AWQ-LLama 7B | file size   | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| AWQ-LLama 7B | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |

### Llama2 7B (Build with cuBLAS)

| Model         | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|--------------:|-------------|-------:|-------:|-------:|-------:|
| Llama2 7B     | perplexity  | 5.8664 | 6.0260 | 6.0656 | 6.4496 |
| Llama2 7B     | file size   | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| Llama2 7B     | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |
| AWQ-LLama2 7B | perplexity  | 5.8801 | 6.0054 | 5.9849 | 6.3650 |
| AWQ-LLama2 7B | file size   | 12.9G  | 3.5G   | 3.9G   | 2.7G   |
| AWQ-LLama2 7B | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |

### Mistral 7B v0.1 (Build with cuBLAS)

| Model          | Measure     | F16    | Q4_0   | Q4_1   | Q2_K   |
|---------------:|-------------|-------:|-------:|-------:|-------:|
| Mistral 7B     | perplexity  | 5.6931 | 5.8202 | 5.8268 | 6.1645 |
| Mistral 7B     | file size   | 14.5G  | 4.1G   | 4.5G   | 3.1G   |
| Mistral 7B     | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |
| AWQ-Mistral 7B | perplexity  | 5.6934 | 5.8020 | 5.7691 | 6.0426 |
| AWQ-Mistral 7B | file size   | 14.5G  | 4.1G   | 4.5G   | 3.1G   |
| AWQ-Mistral 7B | bits/weight | 16.0   | 4.5    | 5.0    | 2.6    |

### MPT 7B (Build with OpenBLAS)

| Model      | Measure     | F16    | Q4_0   | Q4_1   | Q2_K    |
|-----------:|-------------|-------:|-------:|-------:|--------:|
| MPT 7B     | perplexity  | 8.4369 | 8.7956 | 8.6265 | 11.4913 |
| MPT 7B     | file size   | 13.7G  | 3.9G   | 4.3G   | 2.8G    |
| MPT 7B     | bits/weight | 16.0   | 4.5    | 5.0    | 2.6     |
| AWQ-MPT 7B | perplexity  | 8.4944 | 8.7053 | 8.6750 | 10.2873 |
| AWQ-MPT 7B | file size   | 13.7G  | 3.9G   | 4.3G   | 2.8G    |
| AWQ-MPT 7B | bits/weight | 16.0   | 4.5    | 5.0    | 2.6     |
|
@ -1,254 +0,0 @@
|
||||||
"""
|
|
||||||
Implements the AWQ for llama.cpp use cases.
|
|
||||||
Original paper: https://arxiv.org/abs/2306.00978
|
|
||||||
|
|
||||||
This code is based on versions of the AWQ implementation found in the following repositories:
|
|
||||||
* https://github.com/mit-han-lab/llm-awq
|
|
||||||
* https://github.com/casper-hansen/AutoAWQ
|
|
||||||
"""
|
|
||||||
|
|
||||||
import os
|
|
||||||
import torch
|
|
||||||
import torch.nn as nn
|
|
||||||
|
|
||||||
from transformers import AutoModelForCausalLM, AutoConfig
|
|
||||||
from transformers.models.bloom.modeling_bloom import BloomGelu
|
|
||||||
from transformers.models.llama.modeling_llama import LlamaRMSNorm
|
|
||||||
from transformers.activations import GELUActivation
|
|
||||||
|
|
||||||
|
|
||||||
class ScaledActivation(nn.Module):
|
|
||||||
"""
|
|
||||||
ScaledActivation module wraps an existing activation function and applies a
|
|
||||||
scale factor to its output.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
module (nn.Module): The activation function to be scaled.
|
|
||||||
scales (torch.Tensor): A tensor of size (num_features,) containing the initial
|
|
||||||
scale factors for each feature.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
torch.Tensor: The scaled output of the activation function.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, module, scales):
|
|
||||||
super().__init__()
|
|
||||||
self.act = module
|
|
||||||
self.scales = nn.Parameter(scales.data)
|
|
||||||
|
|
||||||
def forward(self, x):
|
|
||||||
return self.act(x) / self.scales.view(1, 1, -1).to(x.device)
|
|
||||||
|
|
||||||
|
|
||||||
def set_op_by_name(layer, name, new_module):
|
|
||||||
"""
|
|
||||||
Set the new module for given module's name.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
layer (nn.Module): The layer in which to replace the submodule.
|
|
||||||
name (str): The path to the submodule to be replaced, using dot notation
|
|
||||||
to access nested modules.
|
|
||||||
new_module (nn.Module): The new module to replace the existing one.
|
|
||||||
"""
|
|
||||||
levels = name.split(".")
|
|
||||||
if len(levels) > 1:
|
|
||||||
mod_ = layer
|
|
||||||
for l_idx in range(len(levels) - 1):
|
|
||||||
if levels[l_idx].isdigit():
|
|
||||||
mod_ = mod_[int(levels[l_idx])]
|
|
||||||
else:
|
|
||||||
mod_ = getattr(mod_, levels[l_idx])
|
|
||||||
setattr(mod_, levels[-1], new_module)
|
|
||||||
else:
|
|
||||||
setattr(layer, name, new_module)
|
|
||||||
|
|
||||||
|
|
||||||
def get_op_by_name(module, op_name):
|
|
||||||
"""
|
|
||||||
Retrieves a submodule within a given layer based on its name.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
module (nn.Module): The layer containing the submodule to find.
|
|
||||||
op_name (str): The name of the submodule.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
nn.Module: The requested submodule found within the given layer.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If the specified submodule cannot be found within the layer.
|
|
||||||
"""
|
|
||||||
for name, m in module.named_modules():
|
|
||||||
if name == op_name:
|
|
||||||
return m
|
|
||||||
raise ValueError(f"Cannot find op {op_name} in module {module}")
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def scale_ln_fcs(ln, fcs, scales):
|
|
||||||
"""
|
|
||||||
Scales the weights of a LayerNorm and a list of fully-connected layers proportionally.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
ln (nn.LayerNorm): The LayerNorm module to be scaled.
|
|
||||||
fcs (List[nn.Linear]): A list of fully-connected layers to be scaled.
|
|
||||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
|
||||||
"""
|
|
||||||
|
|
||||||
if not isinstance(fcs, list):
|
|
||||||
fcs = [fcs]
|
|
||||||
|
|
||||||
scales = scales.to(ln.weight.device)
|
|
||||||
|
|
||||||
ln.weight.div_(scales)
|
|
||||||
if hasattr(ln, "bias") and ln.bias is not None:
|
|
||||||
ln.bias.div_(scales)
|
|
||||||
|
|
||||||
for fc in fcs:
|
|
||||||
fc.weight.mul_(scales.view(1, -1))
|
|
||||||
|
|
||||||
for p in ln.parameters():
|
|
||||||
assert torch.isnan(p).sum() == 0
|
|
||||||
for fc in fcs:
|
|
||||||
for p in fc.parameters():
|
|
||||||
assert torch.isnan(p).sum() == 0
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def scale_fc_fc(fc1, fc2, scales):
|
|
||||||
"""
|
|
||||||
Scales the weights of two fully-connected layers in a specific pattern.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
fc1 (nn.Linear): The first fully-connected layer to be scaled.
|
|
||||||
fc2 (nn.Linear): The second fully-connected layer to be scaled.
|
|
||||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
|
||||||
"""
|
|
||||||
assert isinstance(fc1, nn.Linear)
|
|
||||||
assert isinstance(fc2, nn.Linear)
|
|
||||||
|
|
||||||
scales = scales.to(fc1.weight.device)
|
|
||||||
|
|
||||||
fc1.weight[-scales.size(0):].div_(scales.view(-1, 1))
|
|
||||||
if fc1.bias is not None:
|
|
||||||
fc1.bias.div_(scales.view(-1))
|
|
||||||
|
|
||||||
fc2.weight.mul_(scales.view(1, -1))
|
|
||||||
|
|
||||||
for p in fc1.parameters():
|
|
||||||
assert torch.isnan(p).sum() == 0
|
|
||||||
for p in fc2.parameters():
|
|
||||||
assert torch.isnan(p).sum() == 0
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def scale_gelu_fc(gelu, fc, scales):
|
|
||||||
"""
|
|
||||||
Scales the weight of a GELU activation and a fully-connected layer proportionally.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
gelu (Union[nn.GELU, BloomGelu, GELUActivation]): The GELU activation module to be scaled.
|
|
||||||
fc (nn.Linear): The fully-connected layer to be scaled.
|
|
||||||
scales (torch.Tensor): A 1D tensor of size (num_features,).
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
TypeError: If the `gelu` module is not of type `nn.GELU`, `BloomGelu`, or `GELUActivation`.
|
|
||||||
TypeError: If the `fc` module is not of type `nn.Linear`.
|
|
||||||
"""
|
|
||||||
assert isinstance(gelu, (nn.GELU, BloomGelu, GELUActivation))
|
|
||||||
assert isinstance(fc, nn.Linear)
|
|
||||||
|
|
||||||
fc.weight.mul_(scales.view(1, -1).to(fc.weight.device))
|
|
||||||
|
|
||||||
for p in fc.parameters():
|
|
||||||
assert torch.isnan(p).sum() == 0
|
|
||||||
|
|
||||||
|
|
||||||
def apply_scale(module, scales_list, input_feat_dict=None):
|
|
||||||
"""
|
|
||||||
Applies different scaling strategies to layers based on their type and hierarchy within a given module.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
module (nn.Module): The module containing the layers to be scaled.
|
|
||||||
scales_list (List[Tuple[str, List[str], torch.Tensor]]): A list of tuples containing:
|
|
||||||
* prev_op_name (str): The name of the preceding operation or module,
|
|
||||||
relative to which the layers to be scaled are located.
|
|
||||||
* layer_names (List[str]): A list of names of the layers to be scaled, relative to the preceding operation.
|
|
||||||
* scales (torch.Tensor): A 1D tensor of size (num_features,) containing the scaling factors for each feature.
|
|
||||||
input_feat_dict (Optional[Dict[str, torch.Tensor]]): A dictionary mapping layer names to their corresponding
|
|
||||||
input features (optional).
|
|
||||||
"""
|
|
||||||
for prev_op_name, layer_names, scales in scales_list:
|
|
||||||
prev_op = get_op_by_name(module, prev_op_name)
|
|
||||||
layers = [get_op_by_name(module, name) for name in layer_names]
|
|
||||||
|
|
||||||
prev_op.cuda()
|
|
||||||
for layer in layers:
|
|
||||||
layer.cuda()
|
|
||||||
scales.cuda()
|
|
||||||
|
|
||||||
if isinstance(prev_op, nn.Linear):
|
|
||||||
assert len(layers) == 1
|
|
||||||
scale_fc_fc(prev_op, layers[0], scales)
|
|
||||||
elif isinstance(prev_op, (nn.LayerNorm, LlamaRMSNorm)) or "rmsnorm" in str(prev_op.__class__).lower():
|
|
||||||
scale_ln_fcs(prev_op, layers, scales)
|
|
||||||
elif isinstance(prev_op, (nn.GELU, BloomGelu, GELUActivation)):
|
|
||||||
new_module = ScaledActivation(prev_op, scales)
|
|
||||||
set_op_by_name(module, prev_op_name, new_module)
|
|
||||||
scale_gelu_fc(prev_op, layers[0], scales)
|
|
||||||
else:
|
|
||||||
raise NotImplementedError(f"prev_op {type(prev_op)} not supported yet!")
|
|
||||||
|
|
||||||
# apply the scaling to input feat if given; prepare it for clipping
|
|
||||||
if input_feat_dict is not None:
|
|
||||||
for layer_name in layer_names:
|
|
||||||
inp = input_feat_dict[layer_name]
|
|
||||||
inp.div_(scales.view(1, -1).to(inp.device))
|
|
||||||
|
|
||||||
prev_op.cpu()
|
|
||||||
for layer in layers:
|
|
||||||
layer.cpu()
|
|
||||||
scales.cpu()
|
|
||||||
|
|
||||||
|
|
||||||
@torch.no_grad()
|
|
||||||
def apply_clip(module, clip_list):
|
|
||||||
"""
|
|
||||||
Applies element-wise clipping to the weight of a specific layer within a given module.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
module (nn.Module): The module containing the layer to be clipped.
|
|
||||||
clip_list (List[Tuple[str, torch.Tensor]]): A list of tuples containing:
|
|
||||||
* name (str): The name of the layer to be clipped, relative to the root of the module.
|
|
||||||
* max_val (torch.Tensor): A 1D or 2D tensor defining the upper bound for each element of the layer's weight.
|
|
||||||
"""
|
|
||||||
for name, max_val in clip_list:
|
|
||||||
layer = get_op_by_name(module, name)
|
|
||||||
layer.cuda()
|
|
||||||
max_val = max_val.to(layer.weight.device)
|
|
||||||
org_shape = layer.weight.shape
|
|
||||||
layer.weight.data = layer.weight.data.reshape(*max_val.shape[:2], -1)
|
|
||||||
layer.weight.data = torch.clamp(layer.weight.data, -max_val, max_val)
|
|
||||||
layer.weight.data = layer.weight.data.reshape(org_shape)
|
|
||||||
layer.cpu()
|
|
||||||
|
|
||||||
|
|
||||||
def add_scale_weights(model_path, scale_path, tmp_path):
|
|
||||||
"""
|
|
||||||
Adds pre-computed Activation Weight Quantization (AWQ) results to a model,
|
|
||||||
including scaling factors and clipping bounds.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
model_path (str): Path to the pre-trained model to be equipped with AWQ.
|
|
||||||
scale_path (str): Path to the AWQ scale factors (.pt file).
|
|
||||||
tmp_path (str): Path to the temporary directory where the equipped model will be saved.
|
|
||||||
"""
|
|
||||||
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
|
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
|
||||||
model_path, config=config, trust_remote_code=True
|
|
||||||
)
|
|
||||||
model.eval()
|
|
||||||
awq_results = torch.load(str(scale_path), map_location="cpu")
|
|
||||||
apply_scale(model, awq_results["scale"])
|
|
||||||
apply_clip(model, awq_results["clip"])
|
|
||||||
model.save_pretrained(str(tmp_path))
|
|
||||||
os.system(f"cp {str(model_path)}/tokenizer* {str(tmp_path)}")
|
|
|
@ -1,2 +0,0 @@
|
||||||
torch>=2.1.1
|
|
||||||
transformers>=4.32.0
|
|
138
build.zig
138
build.zig
|
@ -1,138 +0,0 @@
|
||||||
// Compatible with Zig Version 0.11.0
|
|
||||||
const std = @import("std");
|
|
||||||
const ArrayList = std.ArrayList;
|
|
||||||
const Compile = std.Build.Step.Compile;
|
|
||||||
const ConfigHeader = std.Build.Step.ConfigHeader;
|
|
||||||
const Mode = std.builtin.Mode;
|
|
||||||
const CrossTarget = std.zig.CrossTarget;
|
|
||||||
|
|
||||||
const Maker = struct {
|
|
||||||
builder: *std.build.Builder,
|
|
||||||
target: CrossTarget,
|
|
||||||
optimize: Mode,
|
|
||||||
enable_lto: bool,
|
|
||||||
|
|
||||||
include_dirs: ArrayList([]const u8),
|
|
||||||
cflags: ArrayList([]const u8),
|
|
||||||
cxxflags: ArrayList([]const u8),
|
|
||||||
objs: ArrayList(*Compile),
|
|
||||||
|
|
||||||
fn addInclude(m: *Maker, dir: []const u8) !void {
|
|
||||||
try m.include_dirs.append(dir);
|
|
||||||
}
|
|
||||||
fn addProjectInclude(m: *Maker, path: []const []const u8) !void {
|
|
||||||
try m.addInclude(try m.builder.build_root.join(m.builder.allocator, path));
|
|
||||||
}
|
|
||||||
fn addCFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.cflags.append(flag);
|
|
||||||
}
|
|
||||||
fn addCxxFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.cxxflags.append(flag);
|
|
||||||
}
|
|
||||||
fn addFlag(m: *Maker, flag: []const u8) !void {
|
|
||||||
try m.addCFlag(flag);
|
|
||||||
try m.addCxxFlag(flag);
|
|
||||||
}
|
|
||||||
|
|
||||||
fn init(builder: *std.build.Builder) !Maker {
|
|
||||||
const target = builder.standardTargetOptions(.{});
|
|
||||||
const zig_version = @import("builtin").zig_version_string;
|
|
||||||
const commit_hash = try std.ChildProcess.exec(
|
|
||||||
.{ .allocator = builder.allocator, .argv = &.{ "git", "rev-parse", "HEAD" } },
|
|
||||||
);
|
|
||||||
try std.fs.cwd().writeFile("common/build-info.cpp", builder.fmt(
|
|
||||||
\\int LLAMA_BUILD_NUMBER = {};
|
|
||||||
\\char const *LLAMA_COMMIT = "{s}";
|
|
||||||
\\char const *LLAMA_COMPILER = "Zig {s}";
|
|
||||||
\\char const *LLAMA_BUILD_TARGET = "{s}";
|
|
||||||
\\
|
|
||||||
, .{ 0, commit_hash.stdout[0 .. commit_hash.stdout.len - 1], zig_version, try target.allocDescription(builder.allocator) }));
|
|
||||||
var m = Maker{
|
|
||||||
.builder = builder,
|
|
||||||
.target = target,
|
|
||||||
.optimize = builder.standardOptimizeOption(.{}),
|
|
||||||
.enable_lto = false,
|
|
||||||
.include_dirs = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.cflags = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.cxxflags = ArrayList([]const u8).init(builder.allocator),
|
|
||||||
.objs = ArrayList(*Compile).init(builder.allocator),
|
|
||||||
};
|
|
||||||
|
|
||||||
try m.addCFlag("-std=c11");
|
|
||||||
try m.addCxxFlag("-std=c++11");
|
|
||||||
try m.addProjectInclude(&.{});
|
|
||||||
try m.addProjectInclude(&.{"common"});
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn obj(m: *const Maker, name: []const u8, src: []const u8) *Compile {
|
|
||||||
const o = m.builder.addObject(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
|
||||||
if (o.target.getAbi() != .msvc)
|
|
||||||
o.defineCMacro("_GNU_SOURCE", null);
|
|
||||||
|
|
||||||
if (std.mem.endsWith(u8, src, ".c")) {
|
|
||||||
o.addCSourceFiles(&.{src}, m.cflags.items);
|
|
||||||
o.linkLibC();
|
|
||||||
} else {
|
|
||||||
o.addCSourceFiles(&.{src}, m.cxxflags.items);
|
|
||||||
if (o.target.getAbi() == .msvc) {
|
|
||||||
o.linkLibC(); // need winsdk + crt
|
|
||||||
} else {
|
|
||||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
|
||||||
o.linkLibCpp();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (m.include_dirs.items) |i| o.addIncludePath(.{ .path = i });
|
|
||||||
o.want_lto = m.enable_lto;
|
|
||||||
return o;
|
|
||||||
}
|
|
||||||
|
|
||||||
fn exe(m: *const Maker, name: []const u8, src: []const u8, deps: []const *Compile) *Compile {
|
|
||||||
const e = m.builder.addExecutable(.{ .name = name, .target = m.target, .optimize = m.optimize });
|
|
||||||
e.addCSourceFiles(&.{src}, m.cxxflags.items);
|
|
||||||
for (deps) |d| e.addObject(d);
|
|
||||||
for (m.objs.items) |o| e.addObject(o);
|
|
||||||
for (m.include_dirs.items) |i| e.addIncludePath(.{ .path = i });
|
|
||||||
|
|
||||||
// https://github.com/ziglang/zig/issues/15448
|
|
||||||
if (e.target.getAbi() == .msvc) {
|
|
||||||
e.linkLibC(); // need winsdk + crt
|
|
||||||
} else {
|
|
||||||
// linkLibCpp already add (libc++ + libunwind + libc)
|
|
||||||
e.linkLibCpp();
|
|
||||||
}
|
|
||||||
m.builder.installArtifact(e);
|
|
||||||
e.want_lto = m.enable_lto;
|
|
||||||
return e;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
pub fn build(b: *std.build.Builder) !void {
|
|
||||||
var make = try Maker.init(b);
|
|
||||||
make.enable_lto = b.option(bool, "lto", "Enable LTO optimization, (default: false)") orelse false;
|
|
||||||
|
|
||||||
const ggml = make.obj("ggml", "ggml.c");
|
|
||||||
const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
|
|
||||||
const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
|
|
||||||
const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
|
|
||||||
const llama = make.obj("llama", "llama.cpp");
|
|
||||||
const buildinfo = make.obj("common", "common/build-info.cpp");
|
|
||||||
const common = make.obj("common", "common/common.cpp");
|
|
||||||
const console = make.obj("console", "common/console.cpp");
|
|
||||||
const sampling = make.obj("sampling", "common/sampling.cpp");
|
|
||||||
const grammar_parser = make.obj("grammar-parser", "common/grammar-parser.cpp");
|
|
||||||
const train = make.obj("train", "common/train.cpp");
|
|
||||||
const clip = make.obj("clip", "examples/llava/clip.cpp");
|
|
||||||
|
|
||||||
_ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
|
|
||||||
_ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
|
||||||
_ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
|
||||||
_ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
|
|
||||||
_ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
|
||||||
_ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
|
|
||||||
|
|
||||||
const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip });
|
|
||||||
if (server.target.isWindows()) {
|
|
||||||
server.linkSystemLibrary("ws2_32");
|
|
||||||
}
|
|
||||||
}
|
|
817
ci/run.sh
817
ci/run.sh
|
@ -1,4 +1,4 @@
|
||||||
#/bin/bash
|
#!/bin/bash
|
||||||
#
|
#
|
||||||
# sample usage:
|
# sample usage:
|
||||||
#
|
#
|
||||||
|
@ -13,6 +13,9 @@
|
||||||
# # with SYCL support
|
# # with SYCL support
|
||||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
#
|
#
|
||||||
|
# # with VULKAN support
|
||||||
|
# GG_BUILD_VULKAN=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||||
|
#
|
||||||
|
|
||||||
if [ -z "$2" ]; then
|
if [ -z "$2" ]; then
|
||||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||||
|
@ -33,23 +36,28 @@ sd=`dirname $0`
|
||||||
cd $sd/../
|
cd $sd/../
|
||||||
SRC=`pwd`
|
SRC=`pwd`
|
||||||
|
|
||||||
CMAKE_EXTRA=""
|
CMAKE_EXTRA="-DLLAMA_FATAL_WARNINGS=ON"
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_METAL} ]; then
|
if [ ! -z ${GG_BUILD_METAL} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_METAL_SHADER_DEBUG=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON -DGGML_METAL_USE_BF16=ON"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
if [ ! -z ${GG_BUILD_CUDA} ]; then
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
if [ ! -z ${GG_BUILD_SYCL} ]; then
|
||||||
if [ -z ${ONEAPI_ROOT} ]; then
|
if [ -z ${ONEAPI_ROOT} ]; then
|
||||||
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
|
echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
|
||||||
|
echo "source /opt/intel/oneapi/setvars.sh"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ ! -z ${GG_BUILD_VULKAN} ]; then
|
||||||
|
CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"
|
||||||
fi
|
fi
|
||||||
## helpers
|
## helpers
|
||||||
|
|
||||||
|
@ -102,8 +110,11 @@ function gg_run_ctest_debug {
|
||||||
|
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
|
# Check cmake, make and ctest are installed
|
||||||
|
gg_check_build_requirements
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
(time ctest --output-on-failure -L main -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
|
||||||
|
|
||||||
|
@@ -130,8 +141,11 @@ function gg_run_ctest_release {

     set -e

+    # Check cmake, make and ctest are installed
+    gg_check_build_requirements
+
     (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

     if [ -z ${GG_BUILD_LOW_PERF} ]; then
         (time ctest --output-on-failure -L main ) 2>&1 | tee -a $OUT/${ci}-ctest.log
@@ -152,13 +166,64 @@ function gg_sum_ctest_release {
     gg_printf '```\n'
 }

+# test_scripts_debug
+
+function gg_run_test_scripts_debug {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_debug {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in debug mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
+# test_scripts_release
+
+function gg_run_test_scripts_release {
+    cd ${SRC}
+
+    set -e
+
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-release/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+
+    set +e
+}
+
+function gg_sum_test_scripts_release {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'Runs test scripts in release mode\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '```\n'
+    gg_printf '%s\n' "$(cat $OUT/${ci}-scripts.log)"
+    gg_printf '```\n'
+    gg_printf '\n'
+}
+
 function gg_get_model {
-    local gguf_3b="$MNT/models/open-llama/3B-v2/ggml-model-f16.gguf"
-    local gguf_7b="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
-    if [[ -s $gguf_3b ]]; then
-        echo -n "$gguf_3b"
-    elif [[ -s $gguf_7b ]]; then
-        echo -n "$gguf_7b"
+    local gguf_0="$MNT/models/pythia/1.4B/ggml-model-f16.gguf"
+    local gguf_1="$MNT/models/pythia/2.8B/ggml-model-f16.gguf"
+    local gguf_2="$MNT/models/open-llama/7B-v2/ggml-model-f16.gguf"
+    if [[ -s $gguf_0 ]]; then
+        echo -n "$gguf_0"
+    elif [[ -s $gguf_1 ]]; then
+        echo -n "$gguf_1"
+    elif [[ -s $gguf_2 ]]; then
+        echo -n "$gguf_2"
     else
         echo >&2 "No model found. Can't run gg_run_ctest_with_model."
         exit 1
@@ -207,187 +272,7 @@ function gg_sum_ctest_with_model_release {
|
||||||
gg_printf '```\n'
|
gg_printf '```\n'
|
||||||
}
|
}
|
||||||
|
|
||||||
# open_llama_3b_v2
|
|
||||||
|
|
||||||
function gg_run_open_llama_3b_v2 {
|
|
||||||
cd ${SRC}
|
|
||||||
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/tokenizer.model
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/tokenizer_config.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/special_tokens_map.json
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/resolve/main/pytorch_model.bin
|
|
||||||
gg_wget models-mnt/open-llama/3B-v2/ https://huggingface.co/openlm-research/open_llama_3b_v2/raw/main/generation_config.json
|
|
||||||
|
|
||||||
gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
|
|
||||||
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
|
||||||
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
|
||||||
|
|
||||||
path_models="../models-mnt/open-llama/3B-v2"
|
|
||||||
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
|
||||||
|
|
||||||
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
|
||||||
|
|
||||||
set -e
|
|
||||||
|
|
||||||
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_QKK_64=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
|
||||||
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
|
|
||||||
|
|
||||||
python3 ../convert.py ${path_models}
|
|
||||||
|
|
||||||
model_f16="${path_models}/ggml-model-f16.gguf"
|
|
||||||
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
|
||||||
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
|
||||||
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
|
||||||
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
|
||||||
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
|
||||||
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
|
||||||
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
|
||||||
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
|
||||||
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
|
||||||
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
|
||||||
|
|
||||||
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/main --model ${model_q8_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/main --model ${model_q4_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/main --model ${model_q5_0} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/main --model ${model_q5_1} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/main --model ${model_q2_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/main --model ${model_q3_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/main --model ${model_q4_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/main --model ${model_q5_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/main --model ${model_q6_k} -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
|
||||||
|
|
||||||
(time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
|
||||||
|
|
||||||
function check_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
|
||||||
|
|
||||||
# lora
|
|
||||||
function compare_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
path_lora="../models-mnt/open-llama/3B-v2/lora"
|
|
||||||
path_shakespeare="../models-mnt/shakespeare"
|
|
||||||
|
|
||||||
shakespeare="${path_shakespeare}/shakespeare.txt"
|
|
||||||
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
|
||||||
|
|
||||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
|
||||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
|
||||||
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
|
||||||
|
|
||||||
python3 ../convert-lora-to-ggml.py ${path_lora}
|
|
||||||
|
|
||||||
# f16
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
|
||||||
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
# q8_0
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
|
||||||
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
# q8_0 + f16 lora-base
|
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 2 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
|
||||||
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
|
||||||
|
|
||||||
function gg_sum_open_llama_3b_v2 {
|
|
||||||
gg_printf '### %s\n\n' "${ci}"
|
|
||||||
|
|
||||||
gg_printf 'OpenLLaMA 3B-v2:\n'
|
|
||||||
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
|
||||||
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
|
||||||
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
|
||||||
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
|
|
||||||
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
|
||||||
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
|
||||||
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
|
||||||
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
|
||||||
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
|
||||||
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
|
||||||
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
|
||||||
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
|
||||||
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
|
||||||
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
|
||||||
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
|
||||||
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
|
||||||
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
|
||||||
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
|
||||||
}
|
|
||||||
|
|
||||||
# open_llama_7b_v2
|
# open_llama_7b_v2
|
||||||
# requires: GG_BUILD_CUDA
|
|
||||||
|
|
||||||
function gg_run_open_llama_7b_v2 {
|
function gg_run_open_llama_7b_v2 {
|
||||||
cd ${SRC}
|
cd ${SRC}
|
||||||
|
@@ -401,7 +286,7 @@ function gg_run_open_llama_7b_v2 {
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/resolve/main/pytorch_model-00002-of-00002.bin
     gg_wget models-mnt/open-llama/7B-v2/ https://huggingface.co/openlm-research/open_llama_7b_v2/raw/main/generation_config.json

-    gg_wget models-mnt/wikitext/ https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip
+    gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
     unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/

     path_models="../models-mnt/open-llama/7B-v2"
@@ -411,10 +296,10 @@ function gg_run_open_llama_7b_v2 {

     set -e

-    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} -DLLAMA_CUBLAS=1 .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
-    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

-    python3 ../convert.py ${path_models}
+    python3 ../examples/convert_legacy_llama.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

     model_f16="${path_models}/ggml-model-f16.gguf"
     model_q8_0="${path_models}/ggml-model-q8_0.gguf"
@@ -430,44 +315,47 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
wiki_test="${path_wiki}/wiki.test.raw"
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
./bin/quantize ${model_f16} ${model_q8_0} q8_0
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_0} q4_0
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
./bin/quantize ${model_f16} ${model_q4_1} q4_1
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
./bin/quantize ${model_f16} ${model_q5_0} q5_0
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
./bin/quantize ${model_f16} ${model_q5_1} q5_1
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
./bin/quantize ${model_f16} ${model_q2_k} q2_k
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
./bin/quantize ${model_f16} ${model_q3_k} q3_k
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
./bin/quantize ${model_f16} ${model_q4_k} q4_k
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
./bin/quantize ${model_f16} ${model_q5_k} q5_k
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
./bin/quantize ${model_f16} ${model_q6_k} q6_k
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
(time ./bin/main --model ${model_f16} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/main --model ${model_q8_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/main --model ${model_q4_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/main --model ${model_q4_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/main --model ${model_q5_0} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/main --model ${model_q5_1} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/main --model ${model_q2_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/main --model ${model_q3_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/main --model ${model_q4_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/main --model ${model_q5_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/main --model ${model_q6_k} -t 1 -ngl 999 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
(time ./bin/perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
(time ./bin/perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
(time ./bin/perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
(time ./bin/perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
(time ./bin/perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
(time ./bin/perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
(time ./bin/perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
(time ./bin/perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
(time ./bin/perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
(time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
function check_ppl {
|
function check_ppl {
|
||||||
qnt="$1"
|
qnt="$1"
|
||||||
|
@@ -496,48 +384,6 @@ function gg_run_open_llama_7b_v2 {
|
||||||
|
|
||||||
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
# lora
|
|
||||||
function compare_ppl {
|
|
||||||
qnt="$1"
|
|
||||||
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
|
||||||
|
|
||||||
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
|
|
||||||
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
|
|
||||||
return 20
|
|
||||||
fi
|
|
||||||
|
|
||||||
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
|
|
||||||
path_lora="../models-mnt/open-llama/7B-v2/lora"
|
|
||||||
path_shakespeare="../models-mnt/shakespeare"
|
|
||||||
|
|
||||||
shakespeare="${path_shakespeare}/shakespeare.txt"
|
|
||||||
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
|
|
||||||
|
|
||||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
|
|
||||||
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
|
|
||||||
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
|
|
||||||
|
|
||||||
python3 ../convert-lora-to-ggml.py ${path_lora}
|
|
||||||
|
|
||||||
# f16
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
|
|
||||||
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
|
|
||||||
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
# currently not supported by the CUDA backend
|
|
||||||
# q8_0
|
|
||||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
|
|
||||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
|
|
||||||
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
# q8_0 + f16 lora-base
|
|
||||||
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
|
|
||||||
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
|
|
||||||
|
|
||||||
set +e
|
set +e
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -548,7 +394,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -561,15 +406,407 @@ function gg_sum_open_llama_7b_v2 {
|
||||||
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
|
}
|
||||||
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
|
|
||||||
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
|
# pythia_1.4b
|
||||||
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
|
|
||||||
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
|
function gg_run_pythia_1_4b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/1.4B/ https://huggingface.co/EleutherAI/pythia-1.4b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
head -n 60 models-mnt/wikitext/wikitext-2-raw/wiki.test.raw > models-mnt/wikitext/wikitext-2-raw/wiki.test-60.raw
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/1.4B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test_60="${path_wiki}/wiki.test-60.raw"
|
||||||
|
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_f16} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -ngl 99 -c 0 -s 1234 -n 64 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test_60} -ngl 99 -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_1_4b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 1.4B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# pythia_2_8b
|
||||||
|
|
||||||
|
function gg_run_pythia_2_8b {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/pythia/2.8B/ https://huggingface.co/EleutherAI/pythia-2.8b/resolve/main/pytorch_model.bin
|
||||||
|
|
||||||
|
gg_wget models-mnt/wikitext/ https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip
|
||||||
|
unzip -o models-mnt/wikitext/wikitext-2-raw-v1.zip -d models-mnt/wikitext/
|
||||||
|
|
||||||
|
path_models="../models-mnt/pythia/2.8B"
|
||||||
|
path_wiki="../models-mnt/wikitext/wikitext-2-raw"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
model_q4_0="${path_models}/ggml-model-q4_0.gguf"
|
||||||
|
model_q4_1="${path_models}/ggml-model-q4_1.gguf"
|
||||||
|
model_q5_0="${path_models}/ggml-model-q5_0.gguf"
|
||||||
|
model_q5_1="${path_models}/ggml-model-q5_1.gguf"
|
||||||
|
model_q2_k="${path_models}/ggml-model-q2_k.gguf"
|
||||||
|
model_q3_k="${path_models}/ggml-model-q3_k.gguf"
|
||||||
|
model_q4_k="${path_models}/ggml-model-q4_k.gguf"
|
||||||
|
model_q5_k="${path_models}/ggml-model-q5_k.gguf"
|
||||||
|
model_q6_k="${path_models}/ggml-model-q6_k.gguf"
|
||||||
|
|
||||||
|
wiki_test="${path_wiki}/wiki.test.raw"
|
||||||
|
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_0} q4_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_1} q4_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_0} q5_0
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_1} q5_1
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q2_k} q2_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q3_k} q3_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q4_k} q4_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q5_k} q5_k
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q6_k} q6_k
|
||||||
|
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_f16} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q8_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_0} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_1} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q2_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q3_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q4_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q5_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-cli -no-cnv --model ${model_q6_k} -t 1 -ngl 99 -c 0 -s 1234 -n 256 --ignore-eos -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-perplexity --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q8_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_0} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_0.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_1} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_1.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q2_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q2_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q3_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q3_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q4_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q4_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q5_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q5_k.log
|
||||||
|
(time ./bin/llama-perplexity --model ${model_q6_k} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-tg-q6_k.log
|
||||||
|
|
||||||
|
(time ./bin/llama-imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 99 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
|
||||||
|
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 10 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
(time ./bin/llama-save-load-state --model ${model_q4_0} -ngl 99 -c 0 -fa ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
|
||||||
|
|
||||||
|
function check_ppl {
|
||||||
|
qnt="$1"
|
||||||
|
ppl=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
|
||||||
|
|
||||||
|
if [ $(echo "$ppl > 20.0" | bc) -eq 1 ]; then
|
||||||
|
printf ' - %s @ %s (FAIL: ppl > 20.0)\n' "$qnt" "$ppl"
|
||||||
|
return 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
printf ' - %s @ %s OK\n' "$qnt" "$ppl"
|
||||||
|
return 0
|
||||||
|
}
|
||||||
|
|
||||||
|
check_ppl "f16" "$(cat $OUT/${ci}-tg-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q8_0" "$(cat $OUT/${ci}-tg-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_0" "$(cat $OUT/${ci}-tg-q4_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_1" "$(cat $OUT/${ci}-tg-q4_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_0" "$(cat $OUT/${ci}-tg-q5_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_1" "$(cat $OUT/${ci}-tg-q5_1.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
#check_ppl "q2_k" "$(cat $OUT/${ci}-tg-q2_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log # note: ppl > 20.0 for this quant and model
|
||||||
|
check_ppl "q3_k" "$(cat $OUT/${ci}-tg-q3_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q4_k" "$(cat $OUT/${ci}-tg-q4_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q5_k" "$(cat $OUT/${ci}-tg-q5_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
check_ppl "q6_k" "$(cat $OUT/${ci}-tg-q6_k.log | grep "^\[1\]")" | tee -a $OUT/${ci}-ppl.log
|
||||||
|
|
||||||
|
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_pythia_2_8b {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'Pythia 2.8B:\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
|
||||||
|
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
|
||||||
|
gg_printf '- q4_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_1.log)"
|
||||||
|
gg_printf '- q5_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_0.log)"
|
||||||
|
gg_printf '- q5_1:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_1.log)"
|
||||||
|
gg_printf '- q2_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q2_k.log)"
|
||||||
|
gg_printf '- q3_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q3_k.log)"
|
||||||
|
gg_printf '- q4_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_k.log)"
|
||||||
|
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
|
||||||
|
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
|
||||||
|
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
|
||||||
|
}
|
||||||
|
|
||||||
|
# bge-small
|
||||||
|
|
||||||
|
function gg_run_embd_bge_small {
|
||||||
|
cd ${SRC}
|
||||||
|
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/modules.json
|
||||||
|
gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
|
||||||
|
|
||||||
|
gg_wget models-mnt/bge-small/1_Pooling https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/1_Pooling/config.json
|
||||||
|
|
||||||
|
path_models="../models-mnt/bge-small"
|
||||||
|
|
||||||
|
rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
|
||||||
|
|
||||||
|
set -e
|
||||||
|
|
||||||
|
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
|
||||||
|
(time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log
|
||||||
|
|
||||||
|
python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf
|
||||||
|
|
||||||
|
model_f16="${path_models}/ggml-model-f16.gguf"
|
||||||
|
model_q8_0="${path_models}/ggml-model-q8_0.gguf"
|
||||||
|
|
||||||
|
./bin/llama-quantize ${model_f16} ${model_q8_0} q8_0
|
||||||
|
|
||||||
|
(time ./bin/llama-embedding --model ${model_f16} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
|
||||||
|
(time ./bin/llama-embedding --model ${model_q8_0} -p "I believe the meaning of life is" -ngl 99 -c 0 ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
|
||||||
|
|
||||||
|
set +e
|
||||||
|
}
|
||||||
|
|
||||||
|
function gg_sum_embd_bge_small {
|
||||||
|
gg_printf '### %s\n\n' "${ci}"
|
||||||
|
|
||||||
|
gg_printf 'BGE Small (BERT):\n'
|
||||||
|
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
|
||||||
|
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
|
||||||
|
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
|
||||||
|
}

# rerank_tiny

function gg_run_rerank_tiny {
    cd ${SRC}

    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/tokenizer_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/special_tokens_map.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/resolve/main/pytorch_model.bin
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/sentence_bert_config.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/vocab.txt
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/modules.json
    gg_wget models-mnt/rerank-tiny/ https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/config.json

    gg_wget models-mnt/rerank-tiny/1_Pooling https://huggingface.co/jinaai/jina-reranker-v1-tiny-en/raw/main/1_Pooling/config.json

    path_models="../models-mnt/rerank-tiny"

    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc) ) 2>&1 | tee -a $OUT/${ci}-make.log

    python3 ../convert_hf_to_gguf.py ${path_models} --outfile ${path_models}/ggml-model-f16.gguf

    model_f16="${path_models}/ggml-model-f16.gguf"

    # for this model, the SEP token is "</s>"
    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?</s></s>hi\nwhat is panda?</s></s>it's a bear\nwhat is panda?</s></s>The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log

    # sample output
    # rerank score 0: 0.029
    # rerank score 1: 0.029
    # rerank score 2: 0.135

    # check that the score is in the range [$3, $4]
    function check_score {
        qnt="$1"
        score=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)

        if [ $(echo "$score < $3" | bc) -eq 1 ] || [ $(echo "$score > $4" | bc) -eq 1 ]; then
            printf ' - %s @ %s (FAIL: score not in range [%s, %s])\n' "$qnt" "$score" "$3" "$4"
            return 20
        fi

        printf ' - %s @ %s OK\n' "$qnt" "$score"
        return 0
    }

    check_score "rerank score 0" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 0")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 1" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 1")" "0.00" "0.05" | tee -a $OUT/${ci}-rk-f16.log
    check_score "rerank score 2" "$(cat $OUT/${ci}-rk-f16.log | grep "rerank score 2")" "0.10" "0.30" | tee -a $OUT/${ci}-rk-f16.log

    set +e
}
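
Each line of the -p prompt above is one query</s></s>document pair, so three scores come back and only the pair containing the real panda description is expected to land clearly higher. check_score then pulls the last decimal number out of the matching log line and compares it with bc, since bash arithmetic is integer-only. A standalone snippet (not part of the CI script) showing the extraction against the sample output:

# standalone illustration of the extraction used by check_score
line="rerank score 2: 0.135"
score=$(echo "$line" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
echo "$score"                  # prints 0.135
echo "$score > 0.10" | bc      # prints 1, so the lower bound of the expected range holds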

function gg_sum_rerank_tiny {
    gg_printf '### %s\n\n' "${ci}"

    gg_printf 'Rerank Tiny (Jina):\n'
    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-rk-f16.log)"
}

function gg_check_build_requirements {
    if ! command -v cmake &> /dev/null; then
        gg_printf 'cmake not found, please install'
    fi

    if ! command -v make &> /dev/null; then
        gg_printf 'make not found, please install'
    fi

    if ! command -v ctest &> /dev/null; then
        gg_printf 'ctest not found, please install'
    fi
}

## main

export LLAMA_LOG_PREFIX=1
export LLAMA_LOG_TIMESTAMPS=1

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    # Create symlink: ./llama.cpp/models-mnt -> $MNT/models/models-mnt
    rm -rf ${SRC}/models-mnt

@@ -578,7 +815,10 @@ if [ -z ${GG_BUILD_LOW_PERF} ]; then
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a fresh python3 venv and enter it
-   python3 -m venv "$MNT/venv"
+   if ! python3 -m venv "$MNT/venv"; then
+       echo "Error: Failed to create Python virtual environment at $MNT/venv."
+       exit 1
+   fi
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check

@@ -591,11 +831,20 @@ test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

if [ -z ${GG_BUILD_LOW_PERF} ]; then
    test $ret -eq 0 && gg_run embd_bge_small
    test $ret -eq 0 && gg_run rerank_tiny

    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
        test $ret -eq 0 && gg_run test_scripts_debug
        test $ret -eq 0 && gg_run test_scripts_release
    fi

    if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
-       if [ -z ${GG_BUILD_CUDA} ]; then
-           test $ret -eq 0 && gg_run open_llama_3b_v2
+       if [ -z ${GG_BUILD_CUDA} ] && [ -z ${GG_BUILD_VULKAN} ]; then
+           test $ret -eq 0 && gg_run pythia_1_4b
        else
-           test $ret -eq 0 && gg_run open_llama_7b_v2
+           test $ret -eq 0 && gg_run pythia_2_8b
+           #test $ret -eq 0 && gg_run open_llama_7b_v2
        fi

        test $ret -eq 0 && gg_run ctest_with_model_debug
        test $ret -eq 0 && gg_run ctest_with_model_release
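
For context, the CI script is normally driven end to end rather than stage by stage. A typical local invocation looks like the following (the output and mount directories are just example paths), with backend-specific coverage opted into through the GG_BUILD_* environment variables referenced above:

# run the full CI locally, writing results to ./tmp/results and caching models in ./tmp/mnt
mkdir -p tmp

# CPU-only run
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# run with CUDA support enabled
GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt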

cmake/arm64-apple-clang.cmake (new file, 16 lines)
@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Darwin )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-apple-darwin-macho )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.4-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
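
These toolchain files are meant to be passed to CMake at configure time via the standard CMAKE_TOOLCHAIN_FILE variable; a minimal cross-configure sketch (the build directory name is arbitrary):

# configure and build using the arm64 Apple clang toolchain file
cmake -B build-arm64-apple \
      -DCMAKE_TOOLCHAIN_FILE=cmake/arm64-apple-clang.cmake \
      -DCMAKE_BUILD_TYPE=Release
cmake --build build-arm64-apple --config Release

The Windows variants below are used the same way, only with their respective toolchain files.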

cmake/arm64-windows-llvm.cmake (new file, 16 lines)
@@ -0,0 +1,16 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" )
set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" )

cmake/arm64-windows-msvc.cmake (new file, 6 lines)
@@ -0,0 +1,6 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR arm64 )

set( target arm64-pc-windows-msvc )
set( CMAKE_C_COMPILER_TARGET ${target} )
set( CMAKE_CXX_COMPILER_TARGET ${target} )

@@ -44,7 +44,7 @@ if(MSVC)
     set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
 else()
     execute_process(
-        COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
+        COMMAND sh -c "\"$@\" --version | head -1" _ ${CMAKE_C_COMPILER}
         OUTPUT_VARIABLE OUT
         OUTPUT_STRIP_TRAILING_WHITESPACE
     )
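
The only change in this hunk is the added quoting around $@, so a compiler path containing spaces survives the extra sh -c round trip instead of being word-split. A standalone illustration of the difference, using a made-up path:

# '$@' unquoted word-splits a compiler path that contains spaces
sh -c '$@ --version | head -1' _ "/opt/my tools/clang"     # tries to run "/opt/my"
sh -c '"$@" --version | head -1' _ "/opt/my tools/clang"   # runs "/opt/my tools/clang --version"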

cmake/common.cmake (new file, 33 lines)
@@ -0,0 +1,33 @@
function(llama_add_compile_flags)
    if (LLAMA_FATAL_WARNINGS)
        if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
            list(APPEND C_FLAGS -Werror)
            list(APPEND CXX_FLAGS -Werror)
        elseif (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
            add_compile_options(/WX)
        endif()
    endif()

    if (LLAMA_ALL_WARNINGS)
        if (NOT MSVC)
            list(APPEND C_FLAGS -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes
                                -Werror=implicit-int -Werror=implicit-function-declaration)

            list(APPEND CXX_FLAGS -Wmissing-declarations -Wmissing-noreturn)

            list(APPEND WARNING_FLAGS -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function)

            list(APPEND C_FLAGS   ${WARNING_FLAGS})
            list(APPEND CXX_FLAGS ${WARNING_FLAGS})

            ggml_get_flags(${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION})

            add_compile_options("$<$<COMPILE_LANGUAGE:C>:${C_FLAGS};${GF_C_FLAGS}>"
                                "$<$<COMPILE_LANGUAGE:CXX>:${CXX_FLAGS};${GF_CXX_FLAGS}>")
        else()
            # todo : msvc
            set(C_FLAGS   "" PARENT_SCOPE)
            set(CXX_FLAGS "" PARENT_SCOPE)
        endif()
    endif()
endfunction()
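
llama_add_compile_flags() only acts on the LLAMA_FATAL_WARNINGS and LLAMA_ALL_WARNINGS options read above, so the warning behaviour is chosen at configure time. For example:

# opt in to the full warning set and treat warnings as errors
cmake -B build -DLLAMA_ALL_WARNINGS=ON -DLLAMA_FATAL_WARNINGS=ON -DCMAKE_BUILD_TYPE=Release
cmake --build build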

cmake/git-vars.cmake (new file, 22 lines)
@@ -0,0 +1,22 @@
find_package(Git)

# the commit's SHA1
execute_process(COMMAND
    "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the date of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

# the subject of the commit
execute_process(COMMAND
    "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
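
These variables mirror three plain git invocations (the NeVeRmAtCh pattern never matches a tag, so describe falls back to the bare abbreviated commit hash), and the values CMake will pick up can be checked directly from a shell in the source tree:

# the same queries git-vars.cmake runs, executed by hand
git describe --match=NeVeRmAtCh --always --abbrev=8   # -> GIT_SHA1
git log -1 --format=%ad --date=local                  # -> GIT_DATE
git log -1 --format=%s                                # -> GIT_COMMIT_SUBJECT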

cmake/llama-config.cmake.in (new file, 30 lines)
@@ -0,0 +1,30 @@
set(LLAMA_VERSION      @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB   @BUILD_SHARED_LIBS@)

@PACKAGE_INIT@

set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR     "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR     "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")

find_package(ggml REQUIRED HINTS ${LLAMA_LIB_DIR}/cmake)

find_library(llama_LIBRARY llama
    REQUIRED
    HINTS ${LLAMA_LIB_DIR}
    NO_CMAKE_FIND_ROOT_PATH
)

add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
    PROPERTIES
        INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
        INTERFACE_LINK_LIBRARIES "ggml::ggml;ggml::ggml-base;"
        IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
        IMPORTED_LOCATION "${llama_LIBRARY}"
        INTERFACE_COMPILE_FEATURES c_std_90
        POSITION_INDEPENDENT_CODE ON)

check_required_components(Llama)
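
With this package config installed, a downstream CMake project can pick up the imported llama target through find_package. A minimal sketch of such a consumer (install prefix, project name, and main.cpp are illustrative placeholders, not part of the repo):

# sketch: consume an installed llama.cpp from a separate project
cmake --install build --prefix "$HOME/.local/llama"

cat > CMakeLists.txt <<'EOF'
cmake_minimum_required(VERSION 3.14)
project(app CXX)
find_package(llama REQUIRED)
add_executable(app main.cpp)
target_link_libraries(app PRIVATE llama)
EOF

cmake -B build-app -DCMAKE_PREFIX_PATH="$HOME/.local/llama"
cmake --build build-app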

cmake/llama.pc.in (new file, 10 lines)
@@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=@CMAKE_INSTALL_PREFIX@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@

Name: llama
Description: Port of Facebook's LLaMA model in C/C++
Version: @LLAMA_INSTALL_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lllama
Cflags: -I${includedir}
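
The pkg-config file exposes the same include and link line to non-CMake builds; once llama.cpp is installed, a program can be compiled against it like this (main.cpp is a placeholder source file):

# compile against an installed llama.cpp via pkg-config
c++ main.cpp $(pkg-config --cflags --libs llama) -o main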

cmake/x64-windows-llvm.cmake (new file, 11 lines)
@@ -0,0 +1,11 @@
set( CMAKE_SYSTEM_NAME Windows )
set( CMAKE_SYSTEM_PROCESSOR x86_64 )

set( CMAKE_C_COMPILER clang )
set( CMAKE_CXX_COMPILER clang++ )

set( arch_c_flags "-march=native" )

set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )

codecov.yml (deleted, 14 lines)
@@ -1,14 +0,0 @@
comment: off

coverage:
  status:
    project:
      default:
        target: auto
        threshold: 0
        base: auto
    patch:
      default:
        target: auto
        threshold: 0
        base: auto
Some files were not shown because too many files have changed in this diff.